From 0bf71d0cb3a5ec6926a3d6927eb8f2013ac67943 Mon Sep 17 00:00:00 2001 From: Sait Imamoglu Date: Wed, 5 Feb 2020 21:32:42 +0300 Subject: [PATCH] update generic base field --- arch_adx_bmi2.go | 8 + arch_non_adx_bmi2.go | 8 + arithmetic_decl.go | 434 +- field.go | 331 +- field_test.go | 171 +- go.mod | 2 +- go.sum | 1 + x86_arithmetic.s | 100385 ++++++++++++++++++++++---------- x86_is_even.s | 11 - x86_single_limb_arithmetic.s | 212 - 10 files changed, 70162 insertions(+), 31401 deletions(-) create mode 100644 arch_adx_bmi2.go create mode 100644 arch_non_adx_bmi2.go delete mode 100644 x86_is_even.s delete mode 100644 x86_single_limb_arithmetic.s diff --git a/arch_adx_bmi2.go b/arch_adx_bmi2.go new file mode 100644 index 0000000..943f551 --- /dev/null +++ b/arch_adx_bmi2.go @@ -0,0 +1,8 @@ +// +build !no_adx_bmi2 + +package eip + +// we keep this only for testing purposes +func forceNonADXBMI2() bool { + return false +} diff --git a/arch_non_adx_bmi2.go b/arch_non_adx_bmi2.go new file mode 100644 index 0000000..fb54b12 --- /dev/null +++ b/arch_non_adx_bmi2.go @@ -0,0 +1,8 @@ +// +build no_adx_bmi2 + +package eip + +// we keep this only for testing purposes +func forceNonADXBMI2() bool { + return true +} diff --git a/arithmetic_decl.go b/arithmetic_decl.go index 5497461..9e15cdd 100644 --- a/arithmetic_decl.go +++ b/arithmetic_decl.go @@ -1,582 +1,628 @@ package eip -import "unsafe" +//go:noescape +func is_even(a fieldElement) bool + +//go:noescape +func eq1(a, b fieldElement) bool + +//go:noescape +func mul_two_1(a fieldElement) + +//go:noescape +func div_two_1(a fieldElement) + +//go:noescape +func cpy1(dst, src fieldElement) + +//go:noescape +func cmp1(a, b fieldElement) int8 + +//go:noescape +func add1(c, a, b, p fieldElement) + +//go:noescape +func addn1(a, b fieldElement) uint64 + +//go:noescape +func sub1(c, a, b, p fieldElement) + +//go:noescape +func subn1(a, b fieldElement) uint64 + +//go:noescape +func _neg1(c, a, p fieldElement) + +//go:noescape +func double1(c, a, p fieldElement) + +//go:noescape +func mul1(c, a, b, p fieldElement, inp uint64) + +//go:noescape +func mul_no_adx_bmi2_1(c, a, b, p fieldElement, inp uint64) + +//go:noescape +func eq2(a, b fieldElement) bool + +//go:noescape +func mul_two_2(a fieldElement) //go:noescape -func mul_two_1(a unsafe.Pointer) +func div_two_2(a fieldElement) //go:noescape -func mul_two_2(a unsafe.Pointer) +func cpy2(dst, src fieldElement) //go:noescape -func mul_two_3(a unsafe.Pointer) +func cmp2(a, b fieldElement) int8 //go:noescape -func mul_two_4(a unsafe.Pointer) +func add2(c, a, b, p fieldElement) //go:noescape -func mul_two_5(a unsafe.Pointer) +func addn2(a, b fieldElement) uint64 //go:noescape -func mul_two_6(a unsafe.Pointer) +func sub2(c, a, b, p fieldElement) //go:noescape -func mul_two_7(a unsafe.Pointer) +func subn2(a, b fieldElement) uint64 //go:noescape -func mul_two_8(a unsafe.Pointer) +func _neg2(c, a, p fieldElement) //go:noescape -func div_two_1(a unsafe.Pointer) +func double2(c, a, p fieldElement) //go:noescape -func div_two_2(a unsafe.Pointer) +func mul2(c, a, b, p fieldElement, inp uint64) //go:noescape -func div_two_3(a unsafe.Pointer) +func mul_no_adx_bmi2_2(c, a, b, p fieldElement, inp uint64) //go:noescape -func div_two_4(a unsafe.Pointer) +func eq3(a, b fieldElement) bool //go:noescape -func div_two_5(a unsafe.Pointer) +func mul_two_3(a fieldElement) //go:noescape -func div_two_6(a unsafe.Pointer) +func div_two_3(a fieldElement) //go:noescape -func div_two_7(a unsafe.Pointer) +func cpy3(dst, src fieldElement) 
//go:noescape -func div_two_8(a unsafe.Pointer) +func cmp3(a, b fieldElement) int8 //go:noescape -func is_even(a unsafe.Pointer) bool +func add3(c, a, b, p fieldElement) //go:noescape -func mul1(c, a, b, p unsafe.Pointer, inp uint64) +func addn3(a, b fieldElement) uint64 //go:noescape -func mul2(c, a, b, p unsafe.Pointer, inp uint64) +func sub3(c, a, b, p fieldElement) //go:noescape -func mul3(c, a, b, p unsafe.Pointer, inp uint64) +func subn3(a, b fieldElement) uint64 //go:noescape -func mul4(c, a, b, p unsafe.Pointer, inp uint64) +func _neg3(c, a, p fieldElement) //go:noescape -func mul5(c, a, b, p unsafe.Pointer, inp uint64) +func double3(c, a, p fieldElement) //go:noescape -func mul6(c, a, b, p unsafe.Pointer, inp uint64) +func mul3(c, a, b, p fieldElement, inp uint64) //go:noescape -func mul7(c, a, b, p unsafe.Pointer, inp uint64) +func mul_no_adx_bmi2_3(c, a, b, p fieldElement, inp uint64) //go:noescape -func mul8(c, a, b, p unsafe.Pointer, inp uint64) +func eq4(a, b fieldElement) bool //go:noescape -func add1(c, a, b, p unsafe.Pointer) +func mul_two_4(a fieldElement) //go:noescape -func add2(c, a, b, p unsafe.Pointer) +func div_two_4(a fieldElement) //go:noescape -func add3(c, a, b, p unsafe.Pointer) +func cpy4(dst, src fieldElement) //go:noescape -func add4(c, a, b, p unsafe.Pointer) +func cmp4(a, b fieldElement) int8 //go:noescape -func add5(c, a, b, p unsafe.Pointer) +func add4(c, a, b, p fieldElement) //go:noescape -func add6(c, a, b, p unsafe.Pointer) +func addn4(a, b fieldElement) uint64 //go:noescape -func add7(c, a, b, p unsafe.Pointer) +func sub4(c, a, b, p fieldElement) //go:noescape -func add8(c, a, b, p unsafe.Pointer) +func subn4(a, b fieldElement) uint64 //go:noescape -func sub1(c, a, b, p unsafe.Pointer) +func _neg4(c, a, p fieldElement) //go:noescape -func sub2(c, a, b, p unsafe.Pointer) +func double4(c, a, p fieldElement) //go:noescape -func sub3(c, a, b, p unsafe.Pointer) +func mul4(c, a, b, p fieldElement, inp uint64) //go:noescape -func sub4(c, a, b, p unsafe.Pointer) +func mul_no_adx_bmi2_4(c, a, b, p fieldElement, inp uint64) //go:noescape -func sub5(c, a, b, p unsafe.Pointer) +func eq5(a, b fieldElement) bool //go:noescape -func sub6(c, a, b, p unsafe.Pointer) +func mul_two_5(a fieldElement) //go:noescape -func sub7(c, a, b, p unsafe.Pointer) +func div_two_5(a fieldElement) //go:noescape -func sub8(c, a, b, p unsafe.Pointer) +func cpy5(dst, src fieldElement) //go:noescape -func double1(c, a, p unsafe.Pointer) +func cmp5(a, b fieldElement) int8 //go:noescape -func double2(c, a, p unsafe.Pointer) +func add5(c, a, b, p fieldElement) //go:noescape -func double3(c, a, p unsafe.Pointer) +func addn5(a, b fieldElement) uint64 //go:noescape -func double4(c, a, p unsafe.Pointer) +func sub5(c, a, b, p fieldElement) //go:noescape -func double5(c, a, p unsafe.Pointer) +func subn5(a, b fieldElement) uint64 //go:noescape -func double6(c, a, p unsafe.Pointer) +func _neg5(c, a, p fieldElement) //go:noescape -func double7(c, a, p unsafe.Pointer) +func double5(c, a, p fieldElement) //go:noescape -func double8(c, a, p unsafe.Pointer) +func mul5(c, a, b, p fieldElement, inp uint64) //go:noescape -func _neg1(c, a, p unsafe.Pointer) +func mul_no_adx_bmi2_5(c, a, b, p fieldElement, inp uint64) //go:noescape -func _neg2(c, a, p unsafe.Pointer) +func eq6(a, b fieldElement) bool //go:noescape -func _neg3(c, a, p unsafe.Pointer) +func mul_two_6(a fieldElement) //go:noescape -func _neg4(c, a, p unsafe.Pointer) +func div_two_6(a fieldElement) //go:noescape -func _neg5(c, a, p 
unsafe.Pointer) +func cpy6(dst, src fieldElement) //go:noescape -func _neg6(c, a, p unsafe.Pointer) +func cmp6(a, b fieldElement) int8 //go:noescape -func _neg7(c, a, p unsafe.Pointer) +func add6(c, a, b, p fieldElement) //go:noescape -func _neg8(c, a, p unsafe.Pointer) +func addn6(a, b fieldElement) uint64 //go:noescape -func eq1(a, b unsafe.Pointer) bool +func sub6(c, a, b, p fieldElement) //go:noescape -func eq2(a, b unsafe.Pointer) bool +func subn6(a, b fieldElement) uint64 //go:noescape -func eq3(a, b unsafe.Pointer) bool +func _neg6(c, a, p fieldElement) //go:noescape -func eq4(a, b unsafe.Pointer) bool +func double6(c, a, p fieldElement) //go:noescape -func eq5(a, b unsafe.Pointer) bool +func mul6(c, a, b, p fieldElement, inp uint64) //go:noescape -func eq6(a, b unsafe.Pointer) bool +func mul_no_adx_bmi2_6(c, a, b, p fieldElement, inp uint64) //go:noescape -func eq7(a, b unsafe.Pointer) bool +func eq7(a, b fieldElement) bool //go:noescape -func eq8(a, b unsafe.Pointer) bool +func mul_two_7(a fieldElement) //go:noescape -func cpy1(dst, src unsafe.Pointer) +func div_two_7(a fieldElement) //go:noescape -func cpy2(dst, src unsafe.Pointer) +func cpy7(dst, src fieldElement) //go:noescape -func cpy3(dst, src unsafe.Pointer) +func cmp7(a, b fieldElement) int8 //go:noescape -func cpy4(dst, src unsafe.Pointer) +func add7(c, a, b, p fieldElement) //go:noescape -func cpy5(dst, src unsafe.Pointer) +func addn7(a, b fieldElement) uint64 //go:noescape -func cpy6(dst, src unsafe.Pointer) +func sub7(c, a, b, p fieldElement) //go:noescape -func cpy7(dst, src unsafe.Pointer) +func subn7(a, b fieldElement) uint64 //go:noescape -func cpy8(dst, src unsafe.Pointer) +func _neg7(c, a, p fieldElement) //go:noescape -func cmp1(a, b unsafe.Pointer) int8 +func double7(c, a, p fieldElement) //go:noescape -func cmp2(a, b unsafe.Pointer) int8 +func mul7(c, a, b, p fieldElement, inp uint64) //go:noescape -func cmp3(a, b unsafe.Pointer) int8 +func mul_no_adx_bmi2_7(c, a, b, p fieldElement, inp uint64) //go:noescape -func cmp4(a, b unsafe.Pointer) int8 +func eq8(a, b fieldElement) bool //go:noescape -func cmp5(a, b unsafe.Pointer) int8 +func mul_two_8(a fieldElement) //go:noescape -func cmp6(a, b unsafe.Pointer) int8 +func div_two_8(a fieldElement) //go:noescape -func cmp7(a, b unsafe.Pointer) int8 +func cpy8(dst, src fieldElement) //go:noescape -func cmp8(a, b unsafe.Pointer) int8 +func cmp8(a, b fieldElement) int8 //go:noescape -func addn1(a, b unsafe.Pointer) uint64 +func add8(c, a, b, p fieldElement) //go:noescape -func addn2(a, b unsafe.Pointer) uint64 +func addn8(a, b fieldElement) uint64 //go:noescape -func addn3(a, b unsafe.Pointer) uint64 +func sub8(c, a, b, p fieldElement) //go:noescape -func addn4(a, b unsafe.Pointer) uint64 +func subn8(a, b fieldElement) uint64 //go:noescape -func addn5(a, b unsafe.Pointer) uint64 +func _neg8(c, a, p fieldElement) //go:noescape -func addn6(a, b unsafe.Pointer) uint64 +func double8(c, a, p fieldElement) //go:noescape -func addn7(a, b unsafe.Pointer) uint64 +func mul8(c, a, b, p fieldElement, inp uint64) //go:noescape -func addn8(a, b unsafe.Pointer) uint64 +func mul_no_adx_bmi2_8(c, a, b, p fieldElement, inp uint64) //go:noescape -func subn1(a, b unsafe.Pointer) uint64 +func eq9(a, b fieldElement) bool //go:noescape -func subn2(a, b unsafe.Pointer) uint64 +func mul_two_9(a fieldElement) //go:noescape -func subn3(a, b unsafe.Pointer) uint64 +func div_two_9(a fieldElement) //go:noescape -func subn4(a, b unsafe.Pointer) uint64 +func cpy9(dst, src fieldElement) //go:noescape 
-func subn5(a, b unsafe.Pointer) uint64 +func cmp9(a, b fieldElement) int8 //go:noescape -func subn6(a, b unsafe.Pointer) uint64 +func add9(c, a, b, p fieldElement) //go:noescape -func subn7(a, b unsafe.Pointer) uint64 +func addn9(a, b fieldElement) uint64 //go:noescape -func subn8(a, b unsafe.Pointer) uint64 +func sub9(c, a, b, p fieldElement) //go:noescape -func mul_two_9(a unsafe.Pointer) +func subn9(a, b fieldElement) uint64 //go:noescape -func div_two_9(a unsafe.Pointer) +func _neg9(c, a, p fieldElement) //go:noescape -func mul9(c, a, b, p unsafe.Pointer, inp uint64) +func double9(c, a, p fieldElement) //go:noescape -func add9(c, a, b, p unsafe.Pointer) +func mul9(c, a, b, p fieldElement, inp uint64) //go:noescape -func sub9(c, a, b, p unsafe.Pointer) +func mul_no_adx_bmi2_9(c, a, b, p fieldElement, inp uint64) //go:noescape -func double9(c, a, p unsafe.Pointer) +func eq10(a, b fieldElement) bool //go:noescape -func _neg9(c, a, p unsafe.Pointer) +func mul_two_10(a fieldElement) //go:noescape -func eq9(a, b unsafe.Pointer) bool +func div_two_10(a fieldElement) //go:noescape -func cpy9(dst, src unsafe.Pointer) +func cpy10(dst, src fieldElement) //go:noescape -func cmp9(a, b unsafe.Pointer) int8 +func cmp10(a, b fieldElement) int8 //go:noescape -func addn9(a, b unsafe.Pointer) uint64 +func add10(c, a, b, p fieldElement) //go:noescape -func subn9(a, b unsafe.Pointer) uint64 +func addn10(a, b fieldElement) uint64 //go:noescape -func mul_two_10(a unsafe.Pointer) +func sub10(c, a, b, p fieldElement) //go:noescape -func div_two_10(a unsafe.Pointer) +func subn10(a, b fieldElement) uint64 //go:noescape -func mul10(c, a, b, p unsafe.Pointer, inp uint64) +func _neg10(c, a, p fieldElement) //go:noescape -func add10(c, a, b, p unsafe.Pointer) +func double10(c, a, p fieldElement) //go:noescape -func sub10(c, a, b, p unsafe.Pointer) +func mul10(c, a, b, p fieldElement, inp uint64) //go:noescape -func double10(c, a, p unsafe.Pointer) +func mul_no_adx_bmi2_10(c, a, b, p fieldElement, inp uint64) //go:noescape -func _neg10(c, a, p unsafe.Pointer) +func eq11(a, b fieldElement) bool //go:noescape -func eq10(a, b unsafe.Pointer) bool +func mul_two_11(a fieldElement) //go:noescape -func cpy10(dst, src unsafe.Pointer) +func div_two_11(a fieldElement) //go:noescape -func cmp10(a, b unsafe.Pointer) int8 +func cpy11(dst, src fieldElement) //go:noescape -func addn10(a, b unsafe.Pointer) uint64 +func cmp11(a, b fieldElement) int8 //go:noescape -func subn10(a, b unsafe.Pointer) uint64 +func add11(c, a, b, p fieldElement) //go:noescape -func mul_two_11(a unsafe.Pointer) +func addn11(a, b fieldElement) uint64 //go:noescape -func div_two_11(a unsafe.Pointer) +func sub11(c, a, b, p fieldElement) //go:noescape -func mul11(c, a, b, p unsafe.Pointer, inp uint64) +func subn11(a, b fieldElement) uint64 //go:noescape -func add11(c, a, b, p unsafe.Pointer) +func _neg11(c, a, p fieldElement) //go:noescape -func sub11(c, a, b, p unsafe.Pointer) +func double11(c, a, p fieldElement) //go:noescape -func double11(c, a, p unsafe.Pointer) +func mul11(c, a, b, p fieldElement, inp uint64) //go:noescape -func _neg11(c, a, p unsafe.Pointer) +func mul_no_adx_bmi2_11(c, a, b, p fieldElement, inp uint64) //go:noescape -func eq11(a, b unsafe.Pointer) bool +func eq12(a, b fieldElement) bool //go:noescape -func cpy11(dst, src unsafe.Pointer) +func mul_two_12(a fieldElement) //go:noescape -func cmp11(a, b unsafe.Pointer) int8 +func div_two_12(a fieldElement) //go:noescape -func addn11(a, b unsafe.Pointer) uint64 +func cpy12(dst, src 
fieldElement) //go:noescape -func subn11(a, b unsafe.Pointer) uint64 +func cmp12(a, b fieldElement) int8 //go:noescape -func mul_two_12(a unsafe.Pointer) +func add12(c, a, b, p fieldElement) //go:noescape -func div_two_12(a unsafe.Pointer) +func addn12(a, b fieldElement) uint64 //go:noescape -func mul12(c, a, b, p unsafe.Pointer, inp uint64) +func sub12(c, a, b, p fieldElement) //go:noescape -func add12(c, a, b, p unsafe.Pointer) +func subn12(a, b fieldElement) uint64 //go:noescape -func sub12(c, a, b, p unsafe.Pointer) +func _neg12(c, a, p fieldElement) //go:noescape -func double12(c, a, p unsafe.Pointer) +func double12(c, a, p fieldElement) //go:noescape -func _neg12(c, a, p unsafe.Pointer) +func mul12(c, a, b, p fieldElement, inp uint64) //go:noescape -func eq12(a, b unsafe.Pointer) bool +func mul_no_adx_bmi2_12(c, a, b, p fieldElement, inp uint64) //go:noescape -func cpy12(dst, src unsafe.Pointer) +func eq13(a, b fieldElement) bool //go:noescape -func cmp12(a, b unsafe.Pointer) int8 +func mul_two_13(a fieldElement) //go:noescape -func addn12(a, b unsafe.Pointer) uint64 +func div_two_13(a fieldElement) //go:noescape -func subn12(a, b unsafe.Pointer) uint64 +func cpy13(dst, src fieldElement) //go:noescape -func mul_two_13(a unsafe.Pointer) +func cmp13(a, b fieldElement) int8 //go:noescape -func div_two_13(a unsafe.Pointer) +func add13(c, a, b, p fieldElement) //go:noescape -func mul13(c, a, b, p unsafe.Pointer, inp uint64) +func addn13(a, b fieldElement) uint64 //go:noescape -func add13(c, a, b, p unsafe.Pointer) +func sub13(c, a, b, p fieldElement) //go:noescape -func sub13(c, a, b, p unsafe.Pointer) +func subn13(a, b fieldElement) uint64 //go:noescape -func double13(c, a, p unsafe.Pointer) +func _neg13(c, a, p fieldElement) //go:noescape -func _neg13(c, a, p unsafe.Pointer) +func double13(c, a, p fieldElement) //go:noescape -func eq13(a, b unsafe.Pointer) bool +func mul13(c, a, b, p fieldElement, inp uint64) //go:noescape -func cpy13(dst, src unsafe.Pointer) +func mul_no_adx_bmi2_13(c, a, b, p fieldElement, inp uint64) //go:noescape -func cmp13(a, b unsafe.Pointer) int8 +func eq14(a, b fieldElement) bool //go:noescape -func addn13(a, b unsafe.Pointer) uint64 +func mul_two_14(a fieldElement) //go:noescape -func subn13(a, b unsafe.Pointer) uint64 +func div_two_14(a fieldElement) //go:noescape -func mul_two_14(a unsafe.Pointer) +func cpy14(dst, src fieldElement) //go:noescape -func div_two_14(a unsafe.Pointer) +func cmp14(a, b fieldElement) int8 //go:noescape -func mul14(c, a, b, p unsafe.Pointer, inp uint64) +func add14(c, a, b, p fieldElement) //go:noescape -func add14(c, a, b, p unsafe.Pointer) +func addn14(a, b fieldElement) uint64 //go:noescape -func sub14(c, a, b, p unsafe.Pointer) +func sub14(c, a, b, p fieldElement) //go:noescape -func double14(c, a, p unsafe.Pointer) +func subn14(a, b fieldElement) uint64 //go:noescape -func _neg14(c, a, p unsafe.Pointer) +func _neg14(c, a, p fieldElement) //go:noescape -func eq14(a, b unsafe.Pointer) bool +func double14(c, a, p fieldElement) //go:noescape -func cpy14(dst, src unsafe.Pointer) +func mul14(c, a, b, p fieldElement, inp uint64) //go:noescape -func cmp14(a, b unsafe.Pointer) int8 +func mul_no_adx_bmi2_14(c, a, b, p fieldElement, inp uint64) //go:noescape -func addn14(a, b unsafe.Pointer) uint64 +func eq15(a, b fieldElement) bool //go:noescape -func subn14(a, b unsafe.Pointer) uint64 +func mul_two_15(a fieldElement) //go:noescape -func mul_two_15(a unsafe.Pointer) +func div_two_15(a fieldElement) //go:noescape -func div_two_15(a 
unsafe.Pointer) +func cpy15(dst, src fieldElement) //go:noescape -func mul15(c, a, b, p unsafe.Pointer, inp uint64) +func cmp15(a, b fieldElement) int8 //go:noescape -func add15(c, a, b, p unsafe.Pointer) +func add15(c, a, b, p fieldElement) //go:noescape -func sub15(c, a, b, p unsafe.Pointer) +func addn15(a, b fieldElement) uint64 //go:noescape -func double15(c, a, p unsafe.Pointer) +func sub15(c, a, b, p fieldElement) //go:noescape -func _neg15(c, a, p unsafe.Pointer) +func subn15(a, b fieldElement) uint64 //go:noescape -func eq15(a, b unsafe.Pointer) bool +func _neg15(c, a, p fieldElement) //go:noescape -func cpy15(dst, src unsafe.Pointer) +func double15(c, a, p fieldElement) //go:noescape -func cmp15(a, b unsafe.Pointer) int8 +func mul15(c, a, b, p fieldElement, inp uint64) //go:noescape -func addn15(a, b unsafe.Pointer) uint64 +func mul_no_adx_bmi2_15(c, a, b, p fieldElement, inp uint64) //go:noescape -func subn15(a, b unsafe.Pointer) uint64 +func eq16(a, b fieldElement) bool //go:noescape -func mul_two_16(a unsafe.Pointer) +func mul_two_16(a fieldElement) //go:noescape -func div_two_16(a unsafe.Pointer) +func div_two_16(a fieldElement) //go:noescape -func mul16(c, a, b, p unsafe.Pointer, inp uint64) +func cpy16(dst, src fieldElement) //go:noescape -func add16(c, a, b, p unsafe.Pointer) +func cmp16(a, b fieldElement) int8 //go:noescape -func sub16(c, a, b, p unsafe.Pointer) +func add16(c, a, b, p fieldElement) //go:noescape -func double16(c, a, p unsafe.Pointer) +func addn16(a, b fieldElement) uint64 //go:noescape -func _neg16(c, a, p unsafe.Pointer) +func sub16(c, a, b, p fieldElement) //go:noescape -func eq16(a, b unsafe.Pointer) bool +func subn16(a, b fieldElement) uint64 //go:noescape -func cpy16(dst, src unsafe.Pointer) +func _neg16(c, a, p fieldElement) //go:noescape -func cmp16(a, b unsafe.Pointer) int8 +func double16(c, a, p fieldElement) //go:noescape -func addn16(a, b unsafe.Pointer) uint64 +func mul16(c, a, b, p fieldElement, inp uint64) //go:noescape -func subn16(a, b unsafe.Pointer) uint64 +func mul_no_adx_bmi2_16(c, a, b, p fieldElement, inp uint64) diff --git a/field.go b/field.go index 3e6a205..c18654a 100644 --- a/field.go +++ b/field.go @@ -8,14 +8,19 @@ import ( "math/big" "reflect" "unsafe" + + "golang.org/x/sys/cpu" ) // fieldElement is a pointer that addresses // any field element in any limb size type fieldElement = unsafe.Pointer +var nonADXBMI2 = !(cpu.X86.HasADX && cpu.X86.HasBMI2) || forceNonADXBMI2() + type field struct { limbSize int + bitLength uint64 // TODO: remove after fuzz testing p fieldElement inp uint64 one fieldElement @@ -37,52 +42,32 @@ type field struct { subn func(a, b fieldElement) uint64 div_two func(a fieldElement) mul_two func(a fieldElement) - bitLength uint64 // TODO: remove after fuzz testing } func newField(p []byte) (*field, error) { + var err error f := new(field) f.pbig = new(big.Int).SetBytes(p) - var err error f.p, f.limbSize, err = newFieldElementFromBytes(p) if err != nil { return nil, err } - - // TODO: remove after fuzz testing f.bitLength = uint64(f.pbig.BitLen()) R := new(big.Int) R.SetBit(R, f.byteSize()*8, 1).Mod(R, f.pbig) R2 := new(big.Int) R2.Mul(R, R).Mod(R2, f.pbig) inpT := new(big.Int).ModInverse(new(big.Int).Neg(f.pbig), new(big.Int).SetBit(new(big.Int), 64, 1)) - if inpT == nil { - return nil, fmt.Errorf("invalid inverse of prime %x", f.pbig) - } - f.r, err = newFieldElementFromBigUnchecked(f.limbSize, R) - if err != nil { - return nil, err - } + f.r = newFieldElementFromBigUnchecked(f.limbSize, R) f.rbig = R - 
f.one, err = newFieldElementFromBigUnchecked(f.limbSize, R) - if err != nil { - return nil, err - } - f.r2, err = newFieldElementFromBigUnchecked(f.limbSize, R2) - if err != nil { - return nil, err - } - f._one, err = newFieldElementFromBigUnchecked(f.limbSize, big.NewInt(1)) - if err != nil { - return nil, err - } - f.zero, err = newFieldElementFromBigUnchecked(f.limbSize, new(big.Int)) - if err != nil { - return nil, err + f.one = newFieldElementFromBigUnchecked(f.limbSize, R) + f.r2 = newFieldElementFromBigUnchecked(f.limbSize, R2) + f._one = newFieldElementFromBigUnchecked(f.limbSize, big.NewInt(1)) + f.zero = newFieldElementFromBigUnchecked(f.limbSize, new(big.Int)) + if inpT == nil { + return nil, fmt.Errorf("field is not applicable\n%s", hex.EncodeToString(p)) } - f.inp = inpT.Uint64() - switch f.limbSize { case 1: f.equal = eq1 @@ -90,210 +75,275 @@ func newField(p []byte) (*field, error) { f.cmp = cmp1 f.addn = addn1 f.subn = subn1 - f._mul = mul1 f._add = add1 f._sub = sub1 f._double = double1 f._neg = _neg1 f.div_two = div_two_1 f.mul_two = mul_two_1 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_1 + } else { + f._mul = mul1 + } case 2: f.equal = eq2 f.copy = cpy2 f.cmp = cmp2 f.addn = addn2 f.subn = subn2 - f._mul = mul2 f._add = add2 f._sub = sub2 f._double = double2 f._neg = _neg2 f.div_two = div_two_2 f.mul_two = mul_two_2 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_2 + } else { + f._mul = mul2 + } case 3: f.equal = eq3 f.copy = cpy3 f.cmp = cmp3 f.addn = addn3 f.subn = subn3 - f._mul = mul3 f._add = add3 f._sub = sub3 f._double = double3 f._neg = _neg3 f.div_two = div_two_3 f.mul_two = mul_two_3 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_3 + } else { + f._mul = mul3 + } case 4: f.equal = eq4 f.copy = cpy4 f.cmp = cmp4 f.addn = addn4 f.subn = subn4 - f._mul = mul4 f._add = add4 f._sub = sub4 f._double = double4 f._neg = _neg4 f.div_two = div_two_4 f.mul_two = mul_two_4 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_4 + } else { + f._mul = mul4 + } case 5: f.equal = eq5 f.copy = cpy5 f.cmp = cmp5 f.addn = addn5 f.subn = subn5 - f._mul = mul5 f._add = add5 f._sub = sub5 f._double = double5 f._neg = _neg5 f.div_two = div_two_5 f.mul_two = mul_two_5 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_5 + } else { + f._mul = mul5 + } case 6: f.equal = eq6 f.copy = cpy6 f.cmp = cmp6 f.addn = addn6 f.subn = subn6 - f._mul = mul6 f._add = add6 f._sub = sub6 f._double = double6 f._neg = _neg6 f.div_two = div_two_6 f.mul_two = mul_two_6 + f._mul = mul6 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_6 + } else { + f._mul = mul6 + } case 7: f.equal = eq7 f.copy = cpy7 f.cmp = cmp7 f.addn = addn7 f.subn = subn7 - f._mul = mul7 f._add = add7 f._sub = sub7 f._double = double7 f._neg = _neg7 f.div_two = div_two_7 f.mul_two = mul_two_7 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_7 + } else { + f._mul = mul7 + } case 8: f.equal = eq8 f.copy = cpy8 f.cmp = cmp8 f.addn = addn8 f.subn = subn8 - f._mul = mul8 f._add = add8 f._sub = sub8 f._double = double8 f._neg = _neg8 f.div_two = div_two_8 f.mul_two = mul_two_8 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_8 + } else { + f._mul = mul8 + } case 9: f.equal = eq9 f.copy = cpy9 f.cmp = cmp9 f.addn = addn9 f.subn = subn9 - f._mul = mul9 f._add = add9 f._sub = sub9 f._double = double9 f._neg = _neg9 f.div_two = div_two_9 f.mul_two = mul_two_9 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_9 + } else { + f._mul = mul9 + } case 10: f.equal = eq10 f.copy = cpy10 f.cmp = cmp10 f.addn = addn10 f.subn = subn10 - f._mul = mul10 f._add = add10 f._sub = sub10 f._double 
= double10 f._neg = _neg10 f.div_two = div_two_10 f.mul_two = mul_two_10 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_10 + } else { + f._mul = mul10 + } case 11: f.equal = eq11 f.copy = cpy11 f.cmp = cmp11 f.addn = addn11 f.subn = subn11 - f._mul = mul11 f._add = add11 f._sub = sub11 f._double = double11 f._neg = _neg11 f.div_two = div_two_11 f.mul_two = mul_two_11 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_11 + } else { + f._mul = mul11 + } case 12: f.equal = eq12 f.copy = cpy12 f.cmp = cmp12 f.addn = addn12 f.subn = subn12 - f._mul = mul12 f._add = add12 f._sub = sub12 f._double = double12 f._neg = _neg12 f.div_two = div_two_12 f.mul_two = mul_two_12 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_12 + } else { + f._mul = mul12 + } case 13: f.equal = eq13 f.copy = cpy13 f.cmp = cmp13 f.addn = addn13 f.subn = subn13 - f._mul = mul13 f._add = add13 f._sub = sub13 f._double = double13 f._neg = _neg13 f.div_two = div_two_13 f.mul_two = mul_two_13 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_13 + } else { + f._mul = mul13 + } case 14: f.equal = eq14 f.copy = cpy14 f.cmp = cmp14 f.addn = addn14 f.subn = subn14 - f._mul = mul14 f._add = add14 f._sub = sub14 f._double = double14 f._neg = _neg14 f.div_two = div_two_14 f.mul_two = mul_two_14 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_14 + } else { + f._mul = mul14 + } case 15: f.equal = eq15 f.copy = cpy15 f.cmp = cmp15 f.addn = addn15 f.subn = subn15 - f._mul = mul15 f._add = add15 f._sub = sub15 f._double = double15 f._neg = _neg15 f.div_two = div_two_15 f.mul_two = mul_two_15 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_15 + } else { + f._mul = mul15 + } case 16: f.equal = eq16 f.copy = cpy16 f.cmp = cmp16 f.addn = addn16 f.subn = subn16 - f._mul = mul16 f._add = add16 f._sub = sub16 f._double = double16 f._neg = _neg16 f.div_two = div_two_16 f.mul_two = mul_two_16 + if nonADXBMI2 { + f._mul = mul_no_adx_bmi2_16 + } else { + f._mul = mul16 + } default: - return nil, fmt.Errorf("given limb size %d not implemented", f.limbSize) + return nil, fmt.Errorf("limb size %d is not implemented", f.limbSize) } return f, nil } @@ -330,16 +380,15 @@ func (f *field) mul(c, a, b fieldElement) { f._mul(c, a, b, f.p, f.inp) } +func (f *field) square(c, a fieldElement) { + f._mul(c, a, a, f.p, f.inp) +} + func (f *field) exp(c, a fieldElement, e *big.Int) { z := f.newFieldElement() f.copy(z, f.r) - found := false - for i := e.BitLen() - 1; i >= 0; i-- { - if found { - f.mul(z, z, z) - } else { - found = e.Bit(i) == 1 - } + for i := e.BitLen(); i >= 0; i-- { + f.mul(z, z, z) if e.Bit(i) == 1 { f.mul(z, z, a) } @@ -347,6 +396,14 @@ func (f *field) exp(c, a fieldElement, e *big.Int) { f.copy(c, z) } +func (f *field) isOne(fe fieldElement) bool { + return f.equal(fe, f.one) +} + +func (f *field) isZero(fe fieldElement) bool { + return f.equal(fe, f.zero) +} + func (f *field) isValid(fe []byte) bool { feBig := new(big.Int).SetBytes(fe) if feBig.Cmp(f.pbig) != -1 { @@ -356,40 +413,49 @@ func (f *field) isValid(fe []byte) bool { } func (f *field) newFieldElement() fieldElement { - return newFieldElement(f.limbSize) + fe, err := newFieldElement(f.limbSize) + if err != nil { + // panic("this is unexpected") + } + return fe } -func (f *field) randFieldElement(r io.Reader) (fieldElement, error) { +func (f *field) randFieldElement(r io.Reader) fieldElement { bi, err := rand.Int(r, f.pbig) if err != nil { - return nil, err + panic(err) } return newFieldElementFromBigUnchecked(f.limbSize, bi) } func (f *field) newFieldElementFromBytesNoTransform(in []byte) (fieldElement, 
error) { if len(in) != f.byteSize() { - return nil, fmt.Errorf("bad input size %d %d", len(in), f.byteSize()) + return nil, fmt.Errorf("bad input size") } - fe, _, err := newFieldElementFromBytes(in) + fe, limbSize, err := newFieldElementFromBytes(in) if err != nil { return nil, err } + if limbSize != f.limbSize { + // panic("this is unexpected") + } return fe, nil } func (f *field) newFieldElementFromBytes(in []byte) (fieldElement, error) { if len(in) != f.byteSize() { - return nil, fmt.Errorf("bad input size %d %d", len(in), f.byteSize()) + return nil, fmt.Errorf("bad input size") } if !f.isValid(in) { return nil, fmt.Errorf("input is a larger number than modulus") } - fe, _, err := newFieldElementFromBytes(in) + fe, limbSize, err := newFieldElementFromBytes(in) if err != nil { return nil, err } - // if limbSize != _limbSize { // panic("") // is not expected // } + if limbSize != f.limbSize { + // panic("this is unexpected") + } f.toMont(fe, fe) return fe, nil } @@ -409,10 +475,13 @@ func (f *field) newFieldElementFromString(hexStr string) (fieldElement, error) { if len(in) > f.byteSize() { return nil, fmt.Errorf("bad input size") } - fe, _, err := newFieldElementFromBytes(padBytes(in, f.byteSize())) + fe, limbSize, err := newFieldElementFromBytes(padBytes(in, f.byteSize())) if err != nil { return nil, err } + if limbSize != f.limbSize { + // panic("this is unexpected") + } f.toMont(fe, fe) return fe, nil } @@ -423,12 +492,15 @@ func (f *field) newFieldElementFromBig(a *big.Int) (fieldElement, error) { return nil, fmt.Errorf("input is a larger number than modulus") } if len(in) > f.byteSize() { - return nil, fmt.Errorf("bad input size") + return nil, fmt.Errorf("bad input size %d", len(in)) } - fe, _, err := newFieldElementFromBytes(padBytes(in, f.byteSize())) + fe, limbSize, err := newFieldElementFromBytes(padBytes(in, f.byteSize())) if err != nil { return nil, err } + if limbSize != f.limbSize { + // panic("this is unexpected") + } f.toMont(fe, fe) return fe, nil } @@ -521,81 +593,80 @@ func toBytes(fe []uint64) []byte { // limbSize is calculated according to size of input slice func newFieldElementFromBytes(in []byte) (fieldElement, int, error) { byteSize := len(in) - if byteSize%8 != 0 { - return nil, 0, fmt.Errorf("invalid input byte size %d for new field element", byteSize) - } limbSize := byteSize / 8 - if limbSize < 1 || limbSize > 16 { - return nil, 0, fmt.Errorf("given limb size %d not implemented", limbSize) + if byteSize%8 != 0 { + return nil, 0, fmt.Errorf("bad input byte size %d", byteSize) } // TODO: remove after fuzz testing if limbSize < 4 { limbSize = 4 in = padBytes(in, 32) } - a := newFieldElement(limbSize) + a, err := newFieldElement(limbSize) + if err != nil { + return nil, 0, err + } var data []uint64 sh := (*reflect.SliceHeader)(unsafe.Pointer(&data)) sh.Data = uintptr(a) sh.Len, sh.Cap = limbSize, limbSize - limbSliceFromBytes(data[:], in) + if err := limbSliceFromBytes(data[:], in); err != nil { + // panic("this is unexpected") + } return a, limbSize, nil } -func newFieldElement(limbSize int) fieldElement { +func newFieldElement(limbSize int) (fieldElement, error) { switch limbSize { case 1: - return unsafe.Pointer(&[1]uint64{}) + return unsafe.Pointer(&[1]uint64{}), nil case 2: - return unsafe.Pointer(&[2]uint64{}) + return unsafe.Pointer(&[2]uint64{}), nil case 3: - return unsafe.Pointer(&[3]uint64{}) + return unsafe.Pointer(&[3]uint64{}), nil case 4: - return unsafe.Pointer(&[4]uint64{}) + return unsafe.Pointer(&[4]uint64{}), nil case 5: - return 
unsafe.Pointer(&[5]uint64{}) + return unsafe.Pointer(&[5]uint64{}), nil case 6: - return unsafe.Pointer(&[6]uint64{}) + return unsafe.Pointer(&[6]uint64{}), nil case 7: - return unsafe.Pointer(&[7]uint64{}) + return unsafe.Pointer(&[7]uint64{}), nil case 8: - return unsafe.Pointer(&[8]uint64{}) + return unsafe.Pointer(&[8]uint64{}), nil case 9: - return unsafe.Pointer(&[9]uint64{}) + return unsafe.Pointer(&[9]uint64{}), nil case 10: - return unsafe.Pointer(&[10]uint64{}) + return unsafe.Pointer(&[10]uint64{}), nil case 11: - return unsafe.Pointer(&[11]uint64{}) + return unsafe.Pointer(&[11]uint64{}), nil case 12: - return unsafe.Pointer(&[12]uint64{}) + return unsafe.Pointer(&[12]uint64{}), nil case 13: - return unsafe.Pointer(&[13]uint64{}) + return unsafe.Pointer(&[13]uint64{}), nil case 14: - return unsafe.Pointer(&[14]uint64{}) + return unsafe.Pointer(&[14]uint64{}), nil case 15: - return unsafe.Pointer(&[15]uint64{}) + return unsafe.Pointer(&[15]uint64{}), nil case 16: - return unsafe.Pointer(&[16]uint64{}) + return unsafe.Pointer(&[16]uint64{}), nil default: - panic("not implemented") + return nil, fmt.Errorf("limb size %d is not implemented", limbSize) } } -func newFieldElementFromBigUnchecked(limbSize int, bi *big.Int) (fieldElement, error) { +func newFieldElementFromBigUnchecked(limbSize int, bi *big.Int) fieldElement { in := bi.Bytes() byteSize := limbSize * 8 - fe, _, err := newFieldElementFromBytes(padBytes(in, byteSize)) - if err != nil { - return nil, err - } - return fe, nil + fe, _, _ := newFieldElementFromBytes(padBytes(in, byteSize)) + return fe } -func limbSliceFromBytes(out []uint64, in []byte) { +func limbSliceFromBytes(out []uint64, in []byte) error { var byteSize = len(in) var limbSize = len(out) if limbSize*8 != byteSize { - panic("non ... 
input output sizes") + return fmt.Errorf("(byteSize != limbSize * 8), %d, %d", byteSize, limbSize) } var a int for i := 0; i < limbSize; i++ { @@ -605,6 +676,7 @@ func limbSliceFromBytes(out []uint64, in []byte) { uint64(in[a-5])<<32 | uint64(in[a-6])<<40 | uint64(in[a-7])<<48 | uint64(in[a-8])<<56 } + return nil } func padBytes(in []byte, size int) []byte { @@ -616,65 +688,6 @@ func padBytes(in []byte, size int) []byte { return out } -func (f *field) oldInverse(inv, e fieldElement) bool { - u, v, s, r := f.newFieldElement(), - f.newFieldElement(), - f.newFieldElement(), - f.newFieldElement() - zero := f.newFieldElement() - f.copy(u, f.p) - f.copy(v, e) - f.copy(s, f._one) - var k int - var found = false - byteSize := f.byteSize() - bitSize := byteSize * 8 - // Phase 1 - for i := 0; i < bitSize*2; i++ { - if f.equal(v, zero) { - found = true - break - } - if is_even(u) { - f.div_two(u) - f.mul_two(s) - } else if is_even(v) { - f.div_two(v) - f.mul_two(r) - } else if f.cmp(u, v) == 1 { - f.subn(u, v) - f.div_two(u) - f.addn(r, s) - f.mul_two(s) - } else { - f.subn(v, u) - f.div_two(v) - f.addn(s, r) - f.mul_two(r) - } - k += 1 - } - if !found { - f.copy(inv, zero) - return false - } - - if f.cmp(r, f.p) != -1 { - f.subn(r, f.p) - } - f.copy(u, f.p) - f.subn(u, r) - if k < bitSize { - f.copy(inv, zero) - return false - } - // Phase 2 - for i := k; i < bitSize*2; i++ { - f.double(u, u) - } - f.copy(inv, u) - return true -} func (f *field) inverse(inv, e fieldElement) bool { if f.equal(e, f.zero) { f.copy(inv, f.zero) @@ -768,11 +781,3 @@ func (f *field) inverse(inv, e fieldElement) bool { f.toMont(inv, u) return true } - -func (f *field) square(result, a fieldElement) { - f.mul(result, a, a) -} - -func (f *field) isZero(a fieldElement) bool { - return f.equal(a, f.zero) -} diff --git a/field_test.go b/field_test.go index c608fee..674ee5d 100644 --- a/field_test.go +++ b/field_test.go @@ -3,7 +3,6 @@ package eip import ( "bytes" "crypto/rand" - "encoding/hex" "flag" "fmt" "math/big" @@ -15,17 +14,14 @@ var fuz int = 1 var targetNumberOfLimb int = -1 var from = 1 -var to = 8 +var to = 16 -// TODO: remove after fuzz testing -func checkLimbSizeControl(limbsInField, limbs int) bool { - if limbsInField == limbs { - return true +func TestArch(t *testing.T) { + answer := "Yes." + if nonADXBMI2 { + answer = "No." } - if USE_4LIMBS_FOR_LOWER_LIMBS && limbs <= 4 && limbsInField == 4 { - return true - } - return false + fmt.Printf("Is using ADX backend extension? 
%s\n", answer) } func TestMain(m *testing.M) { @@ -53,8 +49,11 @@ func randField(limbSize int) *field { rawpbytes := pbig.Bytes() pbytes := make([]byte, byteSize) copy(pbytes[byteSize-len(rawpbytes):], pbig.Bytes()) - f, _ := newField(pbytes) - return f + field, err := newField(pbytes) + if err != nil { + panic(err) + } + return field } func debugBytes(a ...[]byte) { @@ -75,14 +74,7 @@ func resolveLimbSize(bitSize int) int { } func randBytes(max *big.Int) []byte { - // return padBytes(randBig(max).Bytes(), resolveLimbSize(max.BitLen())*8) - out := padBytes(randBig(max).Bytes(), resolveLimbSize(max.BitLen())*8) - // TODO: remove after fuzz testing - limbSize := resolveLimbSize(max.BitLen()) - if limbSize < 4 { - out = padBytes(out, 32) - } - return out + return padBytes(randBig(max).Bytes(), resolveLimbSize(max.BitLen())*8) } func randBig(max *big.Int) *big.Int { @@ -101,12 +93,12 @@ func BenchmarkField(t *testing.B) { return } field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } bitSize := limbSize * 64 - a, _ := field.randFieldElement(rand.Reader) - b, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) + b := field.randFieldElement(rand.Reader) c := field.newFieldElement() t.Run(fmt.Sprintf("%d_add", bitSize), func(t *testing.B) { for i := 0; i < t.N; i++ { @@ -140,7 +132,7 @@ func TestShift(t *testing.T) { for limbSize := from; limbSize < to+1; limbSize++ { t.Run(fmt.Sprintf("%d_shift", limbSize*64), func(t *testing.T) { field := randField(limbSize) - a, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) bi := field.toBigNoTransform(a) da := field.newFieldElement() field.copy(da, a) @@ -198,7 +190,7 @@ func TestCopy(t *testing.T) { for limbSize := from; limbSize < to+1; limbSize++ { t.Run(fmt.Sprintf("%d_copy", limbSize*64), func(t *testing.T) { field := randField(limbSize) - a, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) b := field.newFieldElement() field.copy(b, a) if !field.equal(a, b) { @@ -212,7 +204,7 @@ func TestSerialization(t *testing.T) { for limbSize := from; limbSize < to+1; limbSize++ { t.Run(fmt.Sprintf("%d_serialization", limbSize*64), func(t *testing.T) { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction\n") } // demont(r) == 1 @@ -229,7 +221,7 @@ func TestSerialization(t *testing.T) { } for i := 0; i < fuz; i++ { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } // bytes @@ -274,11 +266,11 @@ func TestAdditionCrossAgainstBigInt(t *testing.T) { t.Run(fmt.Sprintf("%d_addition_cross", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("Bad field construction") } - a, _ := field.randFieldElement(rand.Reader) - b, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) + b := field.randFieldElement(rand.Reader) c := field.newFieldElement() big_a := field.toBig(a) big_b := field.toBig(b) @@ -317,11 +309,11 @@ func TestAdditionProperties(t *testing.T) { t.Run(fmt.Sprintf("%d_addition_properties", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { field := randField(limbSize) - if 
!checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } - a, _ := field.randFieldElement(rand.Reader) - b, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) + b := field.randFieldElement(rand.Reader) c_1 := field.newFieldElement() c_2 := field.newFieldElement() field.add(c_1, a, field.zero) @@ -361,7 +353,7 @@ func TestAdditionProperties(t *testing.T) { if !field.equal(c_1, c_2) { t.Fatalf("a - b = - ( b - a )") } - c_x, _ := field.randFieldElement(rand.Reader) + c_x := field.randFieldElement(rand.Reader) field.add(c_1, a, b) field.add(c_1, c_1, c_x) field.add(c_2, a, c_x) @@ -386,11 +378,11 @@ func TestMultiplicationCrossAgainstBigInt(t *testing.T) { t.Run(fmt.Sprintf("%d_multiplication_cross", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } - a, _ := field.randFieldElement(rand.Reader) - b, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) + b := field.randFieldElement(rand.Reader) c := field.newFieldElement() big_a := field.toBig(a) big_b := field.toBig(b) @@ -411,11 +403,11 @@ func TestMultiplicationProperties(t *testing.T) { t.Run(fmt.Sprintf("%d_multiplication_properties", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } - a, _ := field.randFieldElement(rand.Reader) - b, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) + b := field.randFieldElement(rand.Reader) c_1 := field.newFieldElement() c_2 := field.newFieldElement() field.mul(c_1, a, field.zero) @@ -431,7 +423,7 @@ func TestMultiplicationProperties(t *testing.T) { if !field.equal(c_1, c_2) { t.Fatalf("a * b == b * a") } - c_x, _ := field.randFieldElement(rand.Reader) + c_x := field.randFieldElement(rand.Reader) field.mul(c_1, a, b) field.mul(c_1, c_1, c_x) field.mul(c_2, c_x, b) @@ -449,10 +441,10 @@ func TestExponentiation(t *testing.T) { t.Run(fmt.Sprintf("%d_exponention", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { field := randField(limbSize) - if !checkLimbSizeControl(field.limbSize, limbSize) { + if field.limbSize != limbSize { t.Fatalf("bad field construction") } - a, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) u := field.newFieldElement() field.exp(u, a, big.NewInt(0)) if !field.equal(u, field.one) { @@ -484,39 +476,7 @@ func TestExponentiation(t *testing.T) { } } -// func TestInversion(t *testing.T) { -// for limbSize := from; limbSize < to+1; limbSize++ { -// t.Run(fmt.Sprintf("%d_inversion", limbSize*64), func(t *testing.T) { -// for i := 0; i < fuz; i++ { -// field := randField(limbSize) -// u := field.newFieldElement() -// field.inverse(u, field.zero) -// if !field.equal(u, field.zero) { -// t.Fatalf("(0^-1) == 0)") -// } -// field.inverse(u, field.one) -// if !field.equal(u, field.one) { -// t.Fatalf("(1^-1) == 1)") -// } -// a, _ := field.randFieldElement(rand.Reader) -// field.inverse(u, a) -// field.mul(u, u, a) -// if !field.equal(u, field.r) { -// t.Fatalf("(r*a) * r*(a^-1) == r)") -// } -// v := field.newFieldElement() -// p := new(big.Int).Set(field.pbig) -// field.exp(u, a, p.Sub(p, big.NewInt(2))) -// field.inverse(v, a) -// if !field.equal(v, u) { -// 
t.Fatalf("a^(p-2) == a^-1") -// } -// } -// }) -// } -// } - -func TestNewInversion(t *testing.T) { +func TestInversion(t *testing.T) { for limbSize := from; limbSize < to+1; limbSize++ { t.Run(fmt.Sprintf("%d_inversion", limbSize*64), func(t *testing.T) { for i := 0; i < fuz; i++ { @@ -530,7 +490,7 @@ func TestNewInversion(t *testing.T) { if !field.equal(u, field.one) { t.Fatalf("(1^-1) == 1)") } - a, _ := field.randFieldElement(rand.Reader) + a := field.randFieldElement(rand.Reader) field.inverse(u, a) field.mul(u, u, a) if !field.equal(u, field.r) { @@ -547,60 +507,3 @@ func TestNewInversion(t *testing.T) { }) } } - -func TestNewInverse(t *testing.T) { - modBytes, err := hex.DecodeString("f3") - if err != nil { - t.Fatal(err) - } - if len(modBytes) < 8 { - modBytes = padBytes(modBytes, 8) - } - // fmt.Printf("modBytes: %x\n", modBytes) - f, err := newField(modBytes) - if err != nil { - t.Fatal(err) - } - - elemBytes, err := hex.DecodeString("e3") - if err != nil { - t.Fatal(err) - } - if len(elemBytes) < 32 { - elemBytes = padBytes(elemBytes, 32) - } - elem, err := f.newFieldElementFromBytes(elemBytes) - if err != nil { - t.Fatal(err) - } - inv := f.newFieldElement() - if ok := f.inverse(inv, elem); !ok { - t.Logf("no inverse") - } - f.mul(inv, inv, elem) - if !f.equal(inv, f.one) { - t.Fatalf("bad inversion") - } -} -func TestNewInverse2(t *testing.T) { - modLen := 32 - modBytes := bytes_(modLen, "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47") - f, err := newField(modBytes) - if err != nil { - t.Fatal(err) - } - elemBytes := bytes_(modLen, "0x07") - elem, err := f.newFieldElementFromBytes(elemBytes) - if err != nil { - t.Fatal(err) - } - inv := f.newFieldElement() - if ok := f.inverse(inv, elem); !ok { - t.Logf("no inverse") - } - f.mul(inv, inv, elem) - if !f.equal(inv, f.one) { - t.Logf("inv: %s\n", f.toString(inv)) - t.Fatalf("bad inversion") - } -} diff --git a/go.mod b/go.mod index cb3507b..b2d9a6d 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( golang.org/x/arch v0.0.0-20190312162104-788fe5ffcd8c // indirect golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 // indirect golang.org/x/net v0.0.0-20190724013045-ca1201d0de80 // indirect - golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e // indirect + golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e golang.org/x/text v0.3.2 // indirect golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a // indirect ) diff --git a/go.sum b/go.sum index de24c7f..353b225 100644 --- a/go.sum +++ b/go.sum @@ -14,6 +14,7 @@ golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e h1:D5TXcfTk7xF7hvieo4QErS3qqCB4teTffacDWr7CI+0= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= diff --git a/x86_arithmetic.s b/x86_arithmetic.s index 5769564..b8da3ab 100644 --- a/x86_arithmetic.s +++ b/x86_arithmetic.s @@ -1,7 +1,212 @@ -// Code generated by command: go run main.go -output ./generic_adx -opt D -arch ADX. 
DO NOT EDIT. +// Code generated by command: go run main.go -output generic -opt D. DO NOT EDIT. #include "textflag.h" +// func cpy1(dst *[1]uint64, src *[1]uint64) +TEXT ·cpy1(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + RET + +// func eq1(a *[1]uint64, b *[1]uint64) bool +TEXT ·eq1(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp1(a *[1]uint64, b *[1]uint64) int8 +TEXT ·cmp1(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add1(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64) +TEXT ·add1(SB), NOSPLIT, $0-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, DX + SUBQ (SI), DX + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC DX, CX + MOVQ CX, (DI) + RET + + // | + +/* end */ + + RET + +// func addn1(a *[1]uint64, b *[1]uint64) uint64 +TEXT ·addn1(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double1(c *[1]uint64, a *[1]uint64, p *[1]uint64) +TEXT ·double1(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, DX + SUBQ (SI), DX + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC DX, CX + MOVQ CX, (DI) + RET + + // | + +/* end */ + + RET + +// func sub1(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64) +TEXT ·sub1(SB), NOSPLIT, $0-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + + // | + MOVQ p+24(FP), SI + MOVQ (SI), DX + CMOVQCC AX, DX + + // | + MOVQ c+0(FP), DI + ADDQ DX, CX + MOVQ CX, (DI) + RET + + // | + +/* end */ + + RET + +// func subn1(a *[1]uint64, b *[1]uint64) uint64 +TEXT ·subn1(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg1(c *[1]uint64, a *[1]uint64, p *[1]uint64) +TEXT ·_neg1(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_1(a *[1]uint64) +TEXT ·mul_two_1(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RET + +// func div_two_1(a *[1]uint64) +TEXT ·div_two_1(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, (DI) + RET + // func cpy2(dst *[2]uint64, src *[2]uint64) TEXT ·cpy2(SB), NOSPLIT, $0-16 MOVQ dst+0(FP), DI @@ -83,6 +288,12 @@ TEXT ·add2(SB), NOSPLIT, $0-32 MOVQ DX, 8(DI) RET + // | + +/* end */ + + RET + // func addn2(a *[2]uint64, b *[2]uint64) uint64 TEXT ·addn2(SB), NOSPLIT, $0-24 // | @@ -102,6 +313,12 @@ TEXT ·addn2(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET + // | + +/* end */ + + RET + // func double2(c *[2]uint64, a *[2]uint64, p *[2]uint64) TEXT ·double2(SB), NOSPLIT, $0-24 // | @@ -129,6 +346,12 @@ TEXT ·double2(SB), NOSPLIT, $0-24 MOVQ DX, 8(DI) RET + // | + +/* end */ + + RET 
+ // func sub2(c *[2]uint64, a *[2]uint64, b *[2]uint64, p *[2]uint64) TEXT ·sub2(SB), NOSPLIT, $0-32 // | @@ -155,6 +378,12 @@ TEXT ·sub2(SB), NOSPLIT, $0-32 MOVQ DX, 8(DI) RET + // | + +/* end */ + + RET + // func subn2(a *[2]uint64, b *[2]uint64) uint64 TEXT ·subn2(SB), NOSPLIT, $0-24 // | @@ -175,6 +404,12 @@ TEXT ·subn2(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET + // | + +/* end */ + + RET + // func _neg2(c *[2]uint64, a *[2]uint64, p *[2]uint64) TEXT ·_neg2(SB), NOSPLIT, $0-24 // | @@ -193,6 +428,12 @@ TEXT ·_neg2(SB), NOSPLIT, $0-24 MOVQ DX, 8(DI) RET + // | + +/* end */ + + RET + // func mul_two_2(a *[2]uint64) TEXT ·mul_two_2(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI @@ -209,107 +450,363 @@ TEXT ·div_two_2(SB), NOSPLIT, $0-8 RCRQ $0x01, (DI) RET -// func mul2(c *[4]uint64, a *[2]uint64, b *[2]uint64, p *[2]uint64, inp uint64) +// func mul2(c *[2]uint64, a *[2]uint64, b *[2]uint64, p *[2]uint64, inp uint64) TEXT ·mul2(SB), NOSPLIT, $0-40 // | -/* inputs */ +/* inputs */ MOVQ a+8(FP), DI MOVQ b+16(FP), SI XORQ AX, AX - // | - // | - MOVQ (SI), DX + // | - // | - MULXQ (DI), AX, R8 - MOVQ AX, CX +/* i = 0 */ - // | - MULXQ 8(DI), AX, R9 + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), CX, R8 + + // | a0 * b1 + MULXQ 8(SI), AX, R9 ADCXQ AX, R8 ADCQ $0x00, R9 - // | - // | - MOVQ 8(SI), DX - XORQ SI, SI + // | - // | - MULXQ (DI), AX, BX +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ DI, DI + + // | a1 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R8 ADCXQ BX, R9 - // | - MULXQ 8(DI), AX, BX + // | a1 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R9 - ADOXQ BX, SI - ADCQ $0x00, SI + ADOXQ BX, DI + ADCQ $0x00, DI - // | - MOVQ p+24(FP), R15 + // | - // | - // | - XORQ BX, BX - MOVQ CX, DX - MULXQ inp+32(FP), DX, DI +/* */ - // | - MULXQ (R15), AX, DI - ADOXQ AX, CX - ADCXQ DI, R8 + // | + // | W + // | 0 CX | 1 R8 + // | 2 R9 | 3 DI - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 - ADOXQ BX, R9 - ADCXQ BX, BX - XORQ CX, CX - // | - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, DI + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ (R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 + // | + // | W ready to mont + // | 0 CX | 1 R8 + // | 2 R9 | 3 DI - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, SI - ADOXQ BX, SI - MOVQ $0x00, BX - ADCXQ BX, BX - ADOXQ R8, BX // | -/* reduction */ +/* montgomery reduction */ - MOVQ R9, AX - SUBQ (R15), AX - MOVQ SI, DX - SBBQ 8(R15), DX - SBBQ $0x00, BX + // | clear flags + XORQ AX, AX - // | - MOVQ c+0(FP), BX - CMOVQCC AX, R9 - MOVQ R9, (BX) - CMOVQCC DX, SI - MOVQ SI, 8(BX) + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 R8 + // | 2 R9 | 3 DI + + + // | | u0 = w0 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w0 @ CX + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | j1 + + // | w1 @ R8 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + ADOXQ CX, R9 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 R8 + // | 2 R9 | 3 DI + + + // | | u1 = w1 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w1 @ R8 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | j1 + + // | w2 @ R9 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, DI + ADOXQ CX, DI + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | + // | W montgomery reduction ends + // | 0 - | 1 - + // | 2 R9 | 3 DI + + + // | + +/* modular reduction */ + + MOVQ R9, CX + SUBQ (SI), CX + MOVQ DI, AX + SBBQ 
8(SI), AX + SBBQ $0x00, R8 + + // | + +/* out */ + + MOVQ c+0(FP), R8 + CMOVQCC CX, R9 + MOVQ R9, (R8) + CMOVQCC AX, DI + MOVQ DI, 8(R8) RET // | -/* end */ +/* end */ + + +// func mul_no_adx_bmi2_2(c *[2]uint64, a *[2]uint64, b *[2]uint64, p *[2]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_2(SB), NOSPLIT, $0-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R10 + MOVQ $0x00, BX + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, R8 + MOVQ DX, R9 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, BX + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W + // | 0 R8 | 1 R9 + // | 2 R10 | 3 BX + + + // | fetch modulus + MOVQ p+24(FP), SI + + // | + +/* montgomery reduction */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 R8 | 1 R9 + // | 2 R10 | 3 BX + + + // | | u0 = w0 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX + + // | + +/* */ + + // | j0 + + // | w0 @ R8 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R8 + ADCQ DX, CX + + // | j1 + + // | w1 @ R9 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ CX, R9 + + // | w2 @ R10 + ADCQ DX, R10 + ADCQ $0x00, R8 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 R9 + // | 2 R10 | 3 BX + + + // | | u1 = w1 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX + + // | + +/* */ + + // | j0 + + // | w1 @ R9 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ DX, CX + + // | j1 + + // | w2 @ R10 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ DX, R8 + ADDQ CX, R10 + + // | w-1 @ BX + ADCQ R8, BX + MOVQ $0x00, R8 + ADCQ $0x00, R8 + + // | + // | W montgomerry reduction ends + // | 0 - | 1 - + // | 2 R10 | 3 BX + + + // | + +/* modular reduction */ + + MOVQ R10, R9 + SUBQ (SI), R9 + MOVQ BX, R11 + SBBQ 8(SI), R11 + SBBQ $0x00, R8 + + // | + +/* out */ + + MOVQ c+0(FP), R8 + CMOVQCC R9, R10 + MOVQ R10, (R8) + CMOVQCC R11, BX + MOVQ BX, 8(R8) + RET + + // | + +/* end */ // func cpy3(dst *[3]uint64, src *[3]uint64) @@ -408,6 +905,12 @@ TEXT ·add3(SB), NOSPLIT, $0-32 MOVQ R8, 16(DI) RET + // | + +/* end */ + + RET + // func addn3(a *[3]uint64, b *[3]uint64) uint64 TEXT ·addn3(SB), NOSPLIT, $0-24 // | @@ -430,6 +933,12 @@ TEXT ·addn3(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET + // | + +/* end */ + + RET + // func double3(c *[3]uint64, a *[3]uint64, p *[3]uint64) TEXT ·double3(SB), NOSPLIT, $0-24 // | @@ -463,6 +972,12 @@ TEXT ·double3(SB), NOSPLIT, $0-24 MOVQ R8, 16(DI) RET + // | + +/* end */ + + RET + // func sub3(c *[3]uint64, a *[3]uint64, b *[3]uint64, p *[3]uint64) TEXT ·sub3(SB), NOSPLIT, $0-32 // | @@ -495,6 +1010,12 @@ TEXT ·sub3(SB), NOSPLIT, $0-32 MOVQ R8, 16(DI) RET + // | + +/* end */ + + RET + // func subn3(a *[3]uint64, b *[3]uint64) uint64 TEXT ·subn3(SB), NOSPLIT, $0-24 // | @@ -518,6 +1039,12 @@ TEXT ·subn3(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET + // | + +/* end */ + + RET + // func _neg3(c *[3]uint64, a *[3]uint64, p *[3]uint64) TEXT ·_neg3(SB), NOSPLIT, $0-24 // | @@ -539,6 +1066,12 @@ TEXT ·_neg3(SB), NOSPLIT, $0-24 MOVQ R8, 16(DI) RET + // | + +/* end */ + + RET + // func mul_two_3(a *[3]uint64) TEXT ·mul_two_3(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI @@ -557,721 +1090,588 @@ TEXT ·div_two_3(SB), NOSPLIT, $0-8 RCRQ $0x01, (DI) RET -// func mul3(c *[6]uint64, a *[3]uint64, b 
*[3]uint64, p *[3]uint64, inp uint64) +// func mul3(c *[3]uint64, a *[3]uint64, b *[3]uint64, p *[3]uint64, inp uint64) TEXT ·mul3(SB), NOSPLIT, $0-40 // | -/* inputs */ +/* inputs */ MOVQ a+8(FP), DI MOVQ b+16(FP), SI XORQ AX, AX - // | - // | - MOVQ (SI), DX + // | - // | - MULXQ (DI), AX, R8 - MOVQ AX, CX +/* i = 0 */ - // | - MULXQ 8(DI), AX, R9 + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), CX, R8 + + // | a0 * b1 + MULXQ 8(SI), AX, R9 ADCXQ AX, R8 - // | - MULXQ 16(DI), AX, R10 + // | a0 * b2 + MULXQ 16(SI), AX, R10 ADCXQ AX, R9 ADCQ $0x00, R10 - // | - // | - MOVQ 8(SI), DX + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX XORQ R11, R11 - // | - MULXQ (DI), AX, BX + // | a1 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R8 ADCXQ BX, R9 - // | - MULXQ 8(DI), AX, BX + // | a1 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 16(DI), AX, BX + // | a1 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 ADOXQ R11, R11 ADCXQ BX, R11 - // | - // | - MOVQ 16(SI), DX - XORQ SI, SI + // | - // | - MULXQ (DI), AX, BX +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ DI, DI + + // | a2 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 8(DI), AX, BX + // | a2 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 16(DI), AX, BX + // | a2 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R11 - ADOXQ BX, SI - ADCQ $0x00, SI + ADOXQ BX, DI + ADCQ $0x00, DI - // | - MOVQ p+24(FP), R15 + // | - // | - // | - XORQ BX, BX - MOVQ CX, DX - MULXQ inp+32(FP), DX, DI +/* */ - // | - MULXQ (R15), AX, DI - ADOXQ AX, CX - ADCXQ DI, R8 + // | + // | W + // | 0 CX | 1 R8 | 2 R9 + // | 3 R10 | 4 R11 | 5 DI - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, R10 - ADOXQ BX, R10 - ADCXQ BX, BX - XORQ CX, CX + // | fetch modulus + MOVQ p+24(FP), SI - // | - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, DI + // | + // | W ready to mont + // | 0 CX | 1 R8 | 2 R9 + // | 3 R10 | 4 R11 | 5 DI - // | - MULXQ (R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, R10 + // | - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R10 - ADCXQ DI, R11 - ADOXQ BX, R11 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R8, R8 +/* montgomery reduction */ - // | - // | - MOVQ R9, DX - MULXQ inp+32(FP), DX, DI + // | clear flags + XORQ AX, AX - // | - MULXQ (R15), AX, DI + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 R8 | 2 R9 + // | 3 R10 | 4 R11 | 5 DI + + + // | | u0 = w0 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w0 @ CX + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | j1 + + // | w1 @ R8 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | j2 + + // | w2 @ R9 + MULXQ 16(SI), AX, BX ADOXQ AX, R9 - ADCXQ DI, R10 + ADCXQ BX, R10 + ADOXQ CX, R10 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX - // | - MULXQ 8(R15), AX, DI + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 R8 | 2 R9 + // | 3 R10 | 4 R11 | 5 DI + + + // | | u1 = w1 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w1 @ R8 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | j1 + + // | w2 @ R9 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | j2 + + // | w3 @ R10 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 - ADCXQ DI, R11 + ADCXQ BX, R11 + ADOXQ CX, R11 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R11 - ADCXQ DI, SI - ADOXQ BX, SI - MOVQ 
$0x00, BX - ADCXQ BX, BX - ADOXQ R9, BX + // | clear flags + XORQ AX, AX // | -/* reduction */ +/* i = 2 */ - MOVQ R10, AX - SUBQ (R15), AX - MOVQ R11, DX - SBBQ 8(R15), DX - MOVQ SI, DI - SBBQ 16(R15), DI - SBBQ $0x00, BX + // | + // | W + // | 0 - | 1 - | 2 R9 + // | 3 R10 | 4 R11 | 5 DI - // | - MOVQ c+0(FP), BX - CMOVQCC AX, R10 - MOVQ R10, (BX) - CMOVQCC DX, R11 - MOVQ R11, 8(BX) - CMOVQCC DI, SI - MOVQ SI, 16(BX) - RET + + // | | u2 = w2 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, BX // | -/* end */ +/* */ + // | j0 -// func cpy4(dst *[4]uint64, src *[4]uint64) -TEXT ·cpy4(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - RET + // | w2 @ R9 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 -// func eq4(a *[4]uint64, b *[4]uint64) bool -TEXT ·eq4(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | j1 -ret: - RET + // | w3 @ R10 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 -// func cmp4(a *[4]uint64, b *[4]uint64) int8 -TEXT ·cmp4(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | j2 -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | w4 @ R11 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, DI + ADOXQ R8, DI + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 -lt: - MOVB $0xff, ret+16(FP) + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - + // | 3 R10 | 4 R11 | 5 DI -ret: - RET -// func add4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64) -TEXT ·add4(SB), NOSPLIT, $0-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - ADCQ $0x00, AX +/* modular reduction */ - // | - MOVQ p+24(FP), SI - MOVQ CX, R10 - SUBQ (SI), R10 - MOVQ DX, R11 - SBBQ 8(SI), R11 - MOVQ R8, R12 - SBBQ 16(SI), R12 - MOVQ R9, R13 - SBBQ 24(SI), R13 - SBBQ $0x00, AX + MOVQ R10, CX + SUBQ (SI), CX + MOVQ R11, AX + SBBQ 8(SI), AX + MOVQ DI, BX + SBBQ 16(SI), BX + SBBQ $0x00, R9 - // | - MOVQ c+0(FP), DI - CMOVQCC R10, CX - MOVQ CX, (DI) - CMOVQCC R11, DX - MOVQ DX, 8(DI) - CMOVQCC R12, R8 - MOVQ R8, 16(DI) - CMOVQCC R13, R9 - MOVQ R9, 24(DI) + // | + +/* out */ + + MOVQ c+0(FP), R9 + CMOVQCC CX, R10 + MOVQ R10, (R9) + CMOVQCC AX, R11 + MOVQ R11, 8(R9) + CMOVQCC BX, DI + MOVQ DI, 16(R9) RET -// func addn4(a *[4]uint64, b *[4]uint64) uint64 -TEXT ·addn4(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - ADCQ $0x00, AX +/* end */ - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ AX, ret+16(FP) - RET -// func double4(c *[4]uint64, a *[4]uint64, p *[4]uint64) -TEXT ·double4(SB), NOSPLIT, $0-24 - // | +// func mul_no_adx_bmi2_3(c *[3]uint64, a *[3]uint64, b *[3]uint64, p *[3]uint64, inp uint64) +TEXT 
·mul_no_adx_bmi2_3(SB), NOSPLIT, $0-40 + // | + +/* inputs */ + MOVQ a+8(FP), DI - XORQ AX, AX + MOVQ b+16(FP), SI + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + + // | + +/* i = 0 */ + + // | a0 @ CX MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - ADCQ $0x00, AX - // | - MOVQ p+16(FP), SI - MOVQ CX, R10 - SUBQ (SI), R10 - MOVQ DX, R11 - SBBQ 8(SI), R11 - MOVQ R8, R12 - SBBQ 16(SI), R12 - MOVQ R9, R13 - SBBQ 24(SI), R13 - SBBQ $0x00, AX + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, R8 + MOVQ DX, R9 - // | - MOVQ c+0(FP), DI - CMOVQCC R10, CX - MOVQ CX, (DI) - CMOVQCC R11, DX - MOVQ DX, 8(DI) - CMOVQCC R12, R8 - MOVQ R8, 16(DI) - CMOVQCC R13, R9 - MOVQ R9, 24(DI) - RET + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 -// func sub4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64) -TEXT ·sub4(SB), NOSPLIT, $0-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | - MOVQ p+24(FP), SI - MOVQ (SI), R10 - CMOVQCC AX, R10 - MOVQ 8(SI), R11 - CMOVQCC AX, R11 - MOVQ 16(SI), R12 - CMOVQCC AX, R12 - MOVQ 24(SI), R13 - CMOVQCC AX, R13 + // | - // | - MOVQ c+0(FP), DI - ADDQ R10, CX - MOVQ CX, (DI) - ADCQ R11, DX - MOVQ DX, 8(DI) - ADCQ R12, R8 - MOVQ R8, 16(DI) - ADCQ R13, R9 - MOVQ R9, 24(DI) - RET +/* i = 1 */ -// func subn4(a *[4]uint64, b *[4]uint64) uint64 -TEXT ·subn4(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - ADCQ $0x00, AX + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ AX, ret+16(FP) - RET + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 -// func _neg4(c *[4]uint64, a *[4]uint64, p *[4]uint64) -TEXT ·_neg4(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 + // | - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - RET +/* i = 2 */ -// func mul_two_4(a *[4]uint64) -TEXT ·mul_two_4(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RET + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX -// func div_two_4(a *[4]uint64) -TEXT ·div_two_4(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, BX -// func mul4(c *[8]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64, inp uint64) -TEXT ·mul4(SB), NOSPLIT, $0-40 // | -/* inputs */ +/* */ - MOVQ a+8(FP), DI - MOVQ 
b+16(FP), SI - XORQ AX, AX + // | + // | W + // | 0 R8 | 1 R9 | 2 R10 + // | 3 R11 | 4 R12 | 5 BX - // | - // | - MOVQ (SI), DX - // | - MULXQ (DI), AX, R8 - MOVQ AX, CX + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ 8(DI), AX, R9 - ADCXQ AX, R8 + // | - // | - MULXQ 16(DI), AX, R10 - ADCXQ AX, R9 +/* montgomery reduction */ - // | - MULXQ 24(DI), AX, R11 - ADCXQ AX, R10 - ADCQ $0x00, R11 + // | - // | - // | - MOVQ 8(SI), DX - XORQ R12, R12 +/* i = 0 */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | + // | W + // | 0 R8 | 1 R9 | 2 R10 + // | 3 R11 | 4 R12 | 5 BX - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | | u0 = w0 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R11 - ADOXQ R12, R12 - ADCXQ BX, R12 + // | - // | - // | - MOVQ 16(SI), DX - XORQ R13, R13 +/* */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | j0 - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | w0 @ R8 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R8 + ADCQ DX, CX - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | j1 - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R12 - ADOXQ R13, R13 - ADCXQ BX, R13 + // | w1 @ R9 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ CX, R9 + MOVQ $0x00, CX + ADCQ DX, CX - // | - // | - MOVQ 24(SI), DX - XORQ SI, SI + // | j2 - // | - MULXQ (DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | w2 @ R10 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ CX, R10 - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | w3 @ R11 + ADCQ DX, R11 + ADCQ $0x00, R8 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R13 - ADOXQ BX, SI - ADCQ $0x00, SI +/* i = 1 */ - // | - MOVQ p+24(FP), R15 + // | + // | W + // | 0 - | 1 R9 | 2 R10 + // | 3 R11 | 4 R12 | 5 BX - // | - // | - XORQ BX, BX - MOVQ CX, DX - MULXQ inp+32(FP), DX, DI - // | - MULXQ (R15), AX, DI - ADOXQ AX, CX - ADCXQ DI, R8 + // | | u1 = w1 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 + // | - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, R10 +/* */ - // | - MULXQ 24(R15), AX, DI - ADOXQ AX, R10 - ADCXQ DI, R11 - ADOXQ BX, R11 - ADCXQ BX, BX - XORQ CX, CX + // | j0 - // | - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, DI + // | w1 @ R9 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ DX, CX - // | - MULXQ (R15), AX, DI - ADOXQ AX, R8 - ADCXQ DI, R9 + // | j1 - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, R10 + // | w2 @ R10 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ CX, R10 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R10 - ADCXQ DI, R11 + // | j2 - // | - MULXQ 24(R15), AX, DI - ADOXQ AX, R11 - ADCXQ DI, R12 - ADOXQ BX, R12 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R8, R8 + // | w3 @ R11 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ DX, R8 + ADDQ CX, R11 - // | - // | - MOVQ R9, DX - MULXQ inp+32(FP), DX, DI + // | w4 @ R12 + ADCQ R8, R12 + MOVQ $0x00, R8 + ADCQ $0x00, R8 - // | - MULXQ (R15), AX, DI - ADOXQ AX, R9 - ADCXQ DI, R10 + // | - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R10 - ADCXQ DI, R11 +/* i = 2 */ - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R11 - ADCXQ DI, R12 + // | + // | W + // | 0 - | 1 - | 2 R10 + // | 3 R11 | 4 R12 | 5 BX - // | - MULXQ 24(R15), AX, DI - ADOXQ AX, 
R12 - ADCXQ DI, R13 - ADOXQ BX, R13 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R9, R9 - // | - // | - MOVQ R10, DX - MULXQ inp+32(FP), DX, DI + // | | u2 = w2 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ (R15), AX, DI - ADOXQ AX, R10 - ADCXQ DI, R11 + // | - // | - MULXQ 8(R15), AX, DI - ADOXQ AX, R11 - ADCXQ DI, R12 +/* */ - // | - MULXQ 16(R15), AX, DI - ADOXQ AX, R12 - ADCXQ DI, R13 + // | j0 - // | - MULXQ 24(R15), AX, DI - ADOXQ AX, R13 - ADCXQ DI, SI - ADOXQ BX, SI - MOVQ $0x00, BX - ADCXQ BX, BX - ADOXQ R10, BX + // | w2 @ R10 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ DX, CX + + // | j1 + + // | w3 @ R11 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ CX, R11 + MOVQ $0x00, CX + ADCQ DX, CX + + // | j2 + + // | w4 @ R12 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ DX, R8 + ADDQ CX, R12 + + // | w-1 @ BX + ADCQ R8, BX + MOVQ $0x00, R8 + ADCQ $0x00, R8 // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - + // | 3 R11 | 4 R12 | 5 BX -/* reduction */ - MOVQ R11, AX - SUBQ (R15), AX - MOVQ R12, DX - SBBQ 8(R15), DX - MOVQ R13, DI - SBBQ 16(R15), DI - MOVQ SI, R8 - SBBQ 24(R15), R8 - SBBQ $0x00, BX + // | - // | - MOVQ c+0(FP), BX - CMOVQCC AX, R11 - MOVQ R11, (BX) - CMOVQCC DX, R12 - MOVQ R12, 8(BX) - CMOVQCC DI, R13 - MOVQ R13, 16(BX) - CMOVQCC R8, SI - MOVQ SI, 24(BX) +/* modular reduction */ + + MOVQ R11, R9 + SUBQ (SI), R9 + MOVQ R12, R10 + SBBQ 8(SI), R10 + MOVQ BX, R13 + SBBQ 16(SI), R13 + SBBQ $0x00, R8 + + // | + +/* out */ + + MOVQ c+0(FP), R8 + CMOVQCC R9, R11 + MOVQ R11, (R8) + CMOVQCC R10, R12 + MOVQ R12, 8(R8) + CMOVQCC R13, BX + MOVQ BX, 16(R8) RET // | -/* end */ +/* end */ -// func cpy5(dst *[5]uint64, src *[5]uint64) -TEXT ·cpy5(SB), NOSPLIT, $0-16 +// func cpy4(dst *[4]uint64, src *[4]uint64) +TEXT ·cpy4(SB), NOSPLIT, $0-16 MOVQ dst+0(FP), DI MOVQ src+8(FP), SI MOVQ (SI), R8 @@ -1282,12 +1682,10 @@ TEXT ·cpy5(SB), NOSPLIT, $0-16 MOVQ R8, 16(DI) MOVQ 24(SI), R8 MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) RET -// func eq5(a *[5]uint64, b *[5]uint64) bool -TEXT ·eq5(SB), NOSPLIT, $0-17 +// func eq4(a *[4]uint64, b *[4]uint64) bool +TEXT ·eq4(SB), NOSPLIT, $0-17 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVB $0x00, ret+16(FP) @@ -1303,22 +1701,15 @@ TEXT ·eq5(SB), NOSPLIT, $0-17 MOVQ 24(DI), R8 CMPQ 24(SI), R8 JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret MOVB $0x01, ret+16(FP) ret: RET -// func cmp5(a *[5]uint64, b *[5]uint64) int8 -TEXT ·cmp5(SB), NOSPLIT, $0-17 +// func cmp4(a *[4]uint64, b *[4]uint64) int8 +TEXT ·cmp4(SB), NOSPLIT, $0-17 MOVQ a+0(FP), DI MOVQ b+8(FP), SI - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt MOVQ 24(DI), R8 CMPQ 24(SI), R8 JB gt @@ -1348,8 +1739,8 @@ lt: ret: RET -// func add5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64) -TEXT ·add5(SB), NOSPLIT, $0-32 +// func add4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64) +TEXT ·add4(SB), NOSPLIT, $0-32 // | MOVQ a+8(FP), DI MOVQ b+16(FP), SI @@ -1364,40 +1755,40 @@ TEXT ·add5(SB), NOSPLIT, $0-32 ADCQ 16(SI), R8 MOVQ 24(DI), R9 ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 ADCQ $0x00, AX // | MOVQ p+24(FP), SI - MOVQ CX, R11 - SUBQ (SI), R11 - MOVQ DX, R12 - SBBQ 8(SI), R12 - MOVQ R8, R13 - SBBQ 16(SI), R13 - MOVQ R9, R14 - SBBQ 24(SI), R14 - MOVQ R10, R15 - SBBQ 32(SI), R15 + MOVQ CX, R10 + SUBQ (SI), R10 + MOVQ DX, R11 + SBBQ 8(SI), R11 + MOVQ R8, R12 + SBBQ 16(SI), R12 + MOVQ R9, R13 + SBBQ 24(SI), R13 SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R11, CX + CMOVQCC R10, CX MOVQ CX, (DI) 
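
Editor's note, illustration only: the add4 body above follows the usual carry/select pattern — add the operands with a full carry chain, speculatively subtract the modulus, then pick the reduced value with CMOVQCC so no branch is taken. A rough Go equivalent is sketched below; the package, function name and fixed 4-limb layout are assumptions for illustration and are not part of this patch.

package sketch

import "math/bits"

// modAdd4 computes c = (a + b) mod p for 4-limb little-endian elements,
// assuming a, b < p. It mirrors the ADDQ/ADCQ, SUBQ/SBBQ and CMOVQCC
// sequence of add4, except that the final select is written as a branch
// for clarity rather than a conditional move.
func modAdd4(c, a, b, p *[4]uint64) {
	var t, u [4]uint64
	var carry, borrow uint64

	// t = a + b with a full carry chain (ADDQ / ADCQ ... / ADCQ $0, AX).
	t[0], carry = bits.Add64(a[0], b[0], 0)
	t[1], carry = bits.Add64(a[1], b[1], carry)
	t[2], carry = bits.Add64(a[2], b[2], carry)
	t[3], carry = bits.Add64(a[3], b[3], carry)

	// u = t - p (SUBQ / SBBQ into scratch registers).
	u[0], borrow = bits.Sub64(t[0], p[0], 0)
	u[1], borrow = bits.Sub64(t[1], p[1], borrow)
	u[2], borrow = bits.Sub64(t[2], p[2], borrow)
	u[3], borrow = bits.Sub64(t[3], p[3], borrow)

	// Keep the subtracted value when the sum overflowed 2^256 or t >= p;
	// this is the SBBQ $0, AX / CMOVQCC selection in the assembly.
	if carry == 1 || borrow == 0 {
		*c = u
	} else {
		*c = t
	}
}
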
- CMOVQCC R12, DX + CMOVQCC R11, DX MOVQ DX, 8(DI) - CMOVQCC R13, R8 + CMOVQCC R12, R8 MOVQ R8, 16(DI) - CMOVQCC R14, R9 + CMOVQCC R13, R9 MOVQ R9, 24(DI) - CMOVQCC R15, R10 - MOVQ R10, 32(DI) RET -// func addn5(a *[5]uint64, b *[5]uint64) uint64 -TEXT ·addn5(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func addn4(a *[4]uint64, b *[4]uint64) uint64 +TEXT ·addn4(SB), NOSPLIT, $0-24 // | MOVQ a+0(FP), DI MOVQ b+8(FP), SI @@ -1411,8 +1802,6 @@ TEXT ·addn5(SB), NOSPLIT, $0-24 ADCQ 16(SI), R8 MOVQ 24(DI), R9 ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 ADCQ $0x00, AX // | @@ -1420,12 +1809,17 @@ TEXT ·addn5(SB), NOSPLIT, $0-24 MOVQ DX, 8(DI) MOVQ R8, 16(DI) MOVQ R9, 24(DI) - MOVQ R10, 32(DI) MOVQ AX, ret+16(FP) RET -// func double5(c *[5]uint64, a *[5]uint64, p *[5]uint64) -TEXT ·double5(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func double4(c *[4]uint64, a *[4]uint64, p *[4]uint64) +TEXT ·double4(SB), NOSPLIT, $0-24 // | MOVQ a+8(FP), DI XORQ AX, AX @@ -1437,40 +1831,40 @@ TEXT ·double5(SB), NOSPLIT, $0-24 ADCQ R8, R8 MOVQ 24(DI), R9 ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 ADCQ $0x00, AX // | MOVQ p+16(FP), SI - MOVQ CX, R11 - SUBQ (SI), R11 - MOVQ DX, R12 - SBBQ 8(SI), R12 - MOVQ R8, R13 - SBBQ 16(SI), R13 - MOVQ R9, R14 - SBBQ 24(SI), R14 - MOVQ R10, R15 - SBBQ 32(SI), R15 + MOVQ CX, R10 + SUBQ (SI), R10 + MOVQ DX, R11 + SBBQ 8(SI), R11 + MOVQ R8, R12 + SBBQ 16(SI), R12 + MOVQ R9, R13 + SBBQ 24(SI), R13 SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R11, CX + CMOVQCC R10, CX MOVQ CX, (DI) - CMOVQCC R12, DX + CMOVQCC R11, DX MOVQ DX, 8(DI) - CMOVQCC R13, R8 + CMOVQCC R12, R8 MOVQ R8, 16(DI) - CMOVQCC R14, R9 + CMOVQCC R13, R9 MOVQ R9, 24(DI) - CMOVQCC R15, R10 - MOVQ R10, 32(DI) RET -// func sub5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64) -TEXT ·sub5(SB), NOSPLIT, $0-32 + // | + +/* end */ + + RET + +// func sub4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64) +TEXT ·sub4(SB), NOSPLIT, $0-32 // | MOVQ a+8(FP), DI MOVQ b+16(FP), SI @@ -1483,38 +1877,38 @@ TEXT ·sub5(SB), NOSPLIT, $0-32 SBBQ 16(SI), R8 MOVQ 24(DI), R9 SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 // | MOVQ p+24(FP), SI - MOVQ (SI), R11 + MOVQ (SI), R10 + CMOVQCC AX, R10 + MOVQ 8(SI), R11 CMOVQCC AX, R11 - MOVQ 8(SI), R12 + MOVQ 16(SI), R12 CMOVQCC AX, R12 - MOVQ 16(SI), R13 + MOVQ 24(SI), R13 CMOVQCC AX, R13 - MOVQ 24(SI), R14 - CMOVQCC AX, R14 - MOVQ 32(SI), R15 - CMOVQCC AX, R15 // | MOVQ c+0(FP), DI - ADDQ R11, CX + ADDQ R10, CX MOVQ CX, (DI) - ADCQ R12, DX + ADCQ R11, DX MOVQ DX, 8(DI) - ADCQ R13, R8 + ADCQ R12, R8 MOVQ R8, 16(DI) - ADCQ R14, R9 + ADCQ R13, R9 MOVQ R9, 24(DI) - ADCQ R15, R10 - MOVQ R10, 32(DI) RET -// func subn5(a *[5]uint64, b *[5]uint64) uint64 -TEXT ·subn5(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func subn4(a *[4]uint64, b *[4]uint64) uint64 +TEXT ·subn4(SB), NOSPLIT, $0-24 // | MOVQ a+0(FP), DI MOVQ b+8(FP), SI @@ -1529,8 +1923,6 @@ TEXT ·subn5(SB), NOSPLIT, $0-24 SBBQ 16(SI), R8 MOVQ 24(DI), R9 SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 ADCQ $0x00, AX // | @@ -1538,12 +1930,17 @@ TEXT ·subn5(SB), NOSPLIT, $0-24 MOVQ DX, 8(DI) MOVQ R8, 16(DI) MOVQ R9, 24(DI) - MOVQ R10, 32(DI) MOVQ AX, ret+16(FP) RET -// func _neg5(c *[5]uint64, a *[5]uint64, p *[5]uint64) -TEXT ·_neg5(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func _neg4(c *[4]uint64, a *[4]uint64, p *[4]uint64) +TEXT ·_neg4(SB), NOSPLIT, $0-24 // | MOVQ a+8(FP), DI @@ -1557,8 +1954,6 @@ TEXT ·_neg5(SB), NOSPLIT, $0-24 SBBQ 16(DI), R8 MOVQ 24(SI), R9 
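
Editor's note, illustration only: sub4 in this hunk uses the complementary trick — compute the raw difference, then add back either p or zero depending on the borrow flag (the CMOVQCC against a zeroed AX), while _neg4 simply computes p - a with a plain SUBQ/SBBQ chain, relying on a < p. The Go sketch below shows the sub4 idea; names and the 4-limb layout are assumptions, not code from this patch.

package sketch

import "math/bits"

// modSub4 computes c = (a - b) mod p for 4-limb little-endian elements,
// assuming a, b < p.
func modSub4(c, a, b, p *[4]uint64) {
	var t [4]uint64
	var borrow uint64

	// t = a - b (SUBQ / SBBQ chain).
	t[0], borrow = bits.Sub64(a[0], b[0], 0)
	t[1], borrow = bits.Sub64(a[1], b[1], borrow)
	t[2], borrow = bits.Sub64(a[2], b[2], borrow)
	t[3], borrow = bits.Sub64(a[3], b[3], borrow)

	// mask is all-ones when the subtraction borrowed, zero otherwise; the
	// assembly makes the same choice with CMOVQCC AX (= 0) over the modulus
	// limbs before the final ADDQ/ADCQ chain.
	mask := -borrow
	var carry uint64
	c[0], carry = bits.Add64(t[0], p[0]&mask, 0)
	c[1], carry = bits.Add64(t[1], p[1]&mask, carry)
	c[2], carry = bits.Add64(t[2], p[2]&mask, carry)
	c[3], _ = bits.Add64(t[3], p[3]&mask, carry)
}
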
SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 // | MOVQ c+0(FP), DI @@ -1566,1270 +1961,902 @@ TEXT ·_neg5(SB), NOSPLIT, $0-24 MOVQ DX, 8(DI) MOVQ R8, 16(DI) MOVQ R9, 24(DI) - MOVQ R10, 32(DI) RET -// func mul_two_5(a *[5]uint64) -TEXT ·mul_two_5(SB), NOSPLIT, $0-8 + // | + +/* end */ + + RET + +// func mul_two_4(a *[4]uint64) +TEXT ·mul_two_4(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX RCLQ $0x01, (DI) RCLQ $0x01, 8(DI) RCLQ $0x01, 16(DI) RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) RET -// func div_two_5(a *[5]uint64) -TEXT ·div_two_5(SB), NOSPLIT, $0-8 +// func div_two_4(a *[4]uint64) +TEXT ·div_two_4(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX - RCRQ $0x01, 32(DI) RCRQ $0x01, 24(DI) RCRQ $0x01, 16(DI) RCRQ $0x01, 8(DI) RCRQ $0x01, (DI) RET -// func mul5(c *[10]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64, inp uint64) -TEXT ·mul5(SB), NOSPLIT, $8-40 +// func mul4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64, inp uint64) +TEXT ·mul4(SB), NOSPLIT, $0-40 // | -/* inputs */ +/* inputs */ MOVQ a+8(FP), DI MOVQ b+16(FP), SI XORQ AX, AX - // | - // | - MOVQ (SI), DX + // | - // | - MULXQ (DI), AX, R8 - MOVQ AX, CX +/* i = 0 */ - // | - MULXQ 8(DI), AX, R9 + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), CX, R8 + + // | a0 * b1 + MULXQ 8(SI), AX, R9 ADCXQ AX, R8 - // | - MULXQ 16(DI), AX, R10 + // | a0 * b2 + MULXQ 16(SI), AX, R10 ADCXQ AX, R9 - // | - MULXQ 24(DI), AX, R11 + // | a0 * b3 + MULXQ 24(SI), AX, R11 ADCXQ AX, R10 + ADCQ $0x00, R11 - // | - MULXQ 32(DI), AX, R12 - ADCXQ AX, R11 - ADCQ $0x00, R12 + // | - // | - // | - MOVQ 8(SI), DX - XORQ R13, R13 +/* i = 1 */ - // | - MULXQ (DI), AX, BX + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R12, R12 + + // | a1 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R8 ADCXQ BX, R9 - // | - MULXQ 8(DI), AX, BX + // | a1 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 16(DI), AX, BX + // | a1 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 24(DI), AX, BX + // | a1 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R11 + ADOXQ R12, R12 ADCXQ BX, R12 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R12 - ADOXQ R13, R13 - ADCXQ BX, R13 + // | - // | - // | - MOVQ 16(SI), DX - XORQ R14, R14 +/* i = 2 */ - // | - MULXQ (DI), AX, BX + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R13, R13 + + // | a2 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 8(DI), AX, BX + // | a2 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 16(DI), AX, BX + // | a2 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 24(DI), AX, BX + // | a2 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R12 + ADOXQ R13, R13 ADCXQ BX, R13 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R13 - ADOXQ R14, R14 - ADCXQ BX, R14 + // | - // | - // | - MOVQ 24(SI), DX - XORQ R15, R15 +/* i = 3 */ - // | - MULXQ (DI), AX, BX + // | a3 @ DX + MOVQ 24(DI), DX + XORQ DI, DI + + // | a3 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 8(DI), AX, BX + // | a3 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 16(DI), AX, BX + // | a3 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R12 ADCXQ BX, R13 - // | - MULXQ 24(DI), AX, BX + // | a3 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R13 - ADCXQ BX, R14 + ADOXQ BX, DI + ADCQ $0x00, DI - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R14 - ADOXQ R15, R15 - ADCXQ BX, R15 + // | - // | - // | - MOVQ 32(SI), DX - XORQ SI, SI +/* */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | + // | W + // | 0 CX | 1 R8 | 2 R9 | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 
7 DI - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | + // | W ready to mont + // | 0 CX | 1 R8 | 2 R9 | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 7 DI - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R15 - ADOXQ BX, SI - ADCQ $0x00, SI - // | - MOVQ SI, (SP) - MOVQ p+24(FP), SI + // | - // | - // | - XORQ BX, BX +/* montgomery reduction */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 R8 | 2 R9 | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 7 DI + + + // | | u0 = w0 * inp MOVQ CX, DX - MULXQ inp+32(FP), DX, DI + MULXQ inp+32(FP), DX, BX - // | - MULXQ (SI), AX, DI + // | + +/* */ + + // | j0 + + // | w0 @ CX + MULXQ (SI), AX, BX ADOXQ AX, CX - ADCXQ DI, R8 + ADCXQ BX, R8 - // | - MULXQ 8(SI), AX, DI + // | j1 + + // | w1 @ R8 + MULXQ 8(SI), AX, BX ADOXQ AX, R8 - ADCXQ DI, R9 + ADCXQ BX, R9 - // | - MULXQ 16(SI), AX, DI + // | j2 + + // | w2 @ R9 + MULXQ 16(SI), AX, BX ADOXQ AX, R9 - ADCXQ DI, R10 + ADCXQ BX, R10 - // | - MULXQ 24(SI), AX, DI + // | j3 + + // | w3 @ R10 + MULXQ 24(SI), AX, BX ADOXQ AX, R10 - ADCXQ DI, R11 + ADCXQ BX, R11 + ADOXQ CX, R11 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX - // | - MULXQ 32(SI), AX, DI - ADOXQ AX, R11 - ADCXQ DI, R12 - ADOXQ BX, R12 - ADCXQ BX, BX - XORQ CX, CX + // | clear flags + XORQ AX, AX - // | - // | + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 R8 | 2 R9 | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 7 DI + + + // | | u1 = w1 * inp MOVQ R8, DX - MULXQ inp+32(FP), DX, DI + MULXQ inp+32(FP), DX, BX - // | - MULXQ (SI), AX, DI + // | + +/* */ + + // | j0 + + // | w1 @ R8 + MULXQ (SI), AX, BX ADOXQ AX, R8 - ADCXQ DI, R9 + ADCXQ BX, R9 - // | - MULXQ 8(SI), AX, DI + // | j1 + + // | w2 @ R9 + MULXQ 8(SI), AX, BX ADOXQ AX, R9 - ADCXQ DI, R10 + ADCXQ BX, R10 - // | - MULXQ 16(SI), AX, DI + // | j2 + + // | w3 @ R10 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 - ADCXQ DI, R11 + ADCXQ BX, R11 - // | - MULXQ 24(SI), AX, DI + // | j3 + + // | w4 @ R11 + MULXQ 24(SI), AX, BX ADOXQ AX, R11 - ADCXQ DI, R12 + ADCXQ BX, R12 + ADOXQ CX, R12 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 - // | - MULXQ 32(SI), AX, DI - ADOXQ AX, R12 - ADCXQ DI, R13 - ADOXQ BX, R13 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R8, R8 + // | clear flags + XORQ AX, AX - // | - // | + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 R9 | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 7 DI + + + // | | u2 = w2 * inp MOVQ R9, DX - MULXQ inp+32(FP), DX, DI + MULXQ inp+32(FP), DX, BX - // | - MULXQ (SI), AX, DI + // | + +/* */ + + // | j0 + + // | w2 @ R9 + MULXQ (SI), AX, BX ADOXQ AX, R9 - ADCXQ DI, R10 + ADCXQ BX, R10 - // | - MULXQ 8(SI), AX, DI + // | j1 + + // | w3 @ R10 + MULXQ 8(SI), AX, BX ADOXQ AX, R10 - ADCXQ DI, R11 + ADCXQ BX, R11 - // | - MULXQ 16(SI), AX, DI + // | j2 + + // | w4 @ R11 + MULXQ 16(SI), AX, BX ADOXQ AX, R11 - ADCXQ DI, R12 + ADCXQ BX, R12 - // | - MULXQ 24(SI), AX, DI + // | j3 + + // | w5 @ R12 + MULXQ 24(SI), AX, BX ADOXQ AX, R12 - ADCXQ DI, R13 + ADCXQ BX, R13 + ADOXQ R8, R13 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 - // | - MULXQ 32(SI), AX, DI - ADOXQ AX, R13 - ADCXQ DI, R14 - ADOXQ BX, R14 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R9, R9 + // | clear flags + XORQ AX, AX - // | - // | + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R10 + // | 4 R11 | 5 R12 | 6 R13 | 7 DI + + + // | | u3 = w3 * inp MOVQ R10, DX - MULXQ inp+32(FP), 
DX, DI + MULXQ inp+32(FP), DX, BX - // | - MULXQ (SI), AX, DI + // | + +/* */ + + // | j0 + + // | w3 @ R10 + MULXQ (SI), AX, BX ADOXQ AX, R10 - ADCXQ DI, R11 + ADCXQ BX, R11 - // | - MULXQ 8(SI), AX, DI + // | j1 + + // | w4 @ R11 + MULXQ 8(SI), AX, BX ADOXQ AX, R11 - ADCXQ DI, R12 + ADCXQ BX, R12 - // | - MULXQ 16(SI), AX, DI + // | j2 + + // | w5 @ R12 + MULXQ 16(SI), AX, BX ADOXQ AX, R12 - ADCXQ DI, R13 + ADCXQ BX, R13 - // | - MULXQ 24(SI), AX, DI + // | j3 + + // | w6 @ R13 + MULXQ 24(SI), AX, BX ADOXQ AX, R13 - ADCXQ DI, R14 + ADCXQ BX, DI + ADOXQ R9, DI + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 - // | - MULXQ 32(SI), AX, DI - ADOXQ AX, R14 - ADCXQ DI, R15 - ADOXQ BX, R15 - MOVQ $0x00, BX - ADCXQ BX, BX - XORQ R10, R10 - MOVQ (SP), R10 + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - + // | 4 R11 | 5 R12 | 6 R13 | 7 DI - // | - // | - MOVQ R11, DX - MULXQ inp+32(FP), DX, DI - // | - MULXQ (SI), AX, DI - ADOXQ AX, R11 - ADCXQ DI, R12 + // | - // | - MULXQ 8(SI), AX, DI - ADOXQ AX, R12 - ADCXQ DI, R13 +/* modular reduction */ - // | - MULXQ 16(SI), AX, DI - ADOXQ AX, R13 - ADCXQ DI, R14 - - // | - MULXQ 24(SI), AX, DI - ADOXQ AX, R14 - ADCXQ DI, R15 - - // | - MULXQ 32(SI), AX, DI - ADOXQ AX, R15 - ADCXQ DI, R10 - ADOXQ BX, R10 - MOVQ $0x00, BX - ADCXQ BX, BX - ADOXQ R11, BX + MOVQ R11, CX + SUBQ (SI), CX + MOVQ R12, AX + SBBQ 8(SI), AX + MOVQ R13, BX + SBBQ 16(SI), BX + MOVQ DI, R8 + SBBQ 24(SI), R8 + SBBQ $0x00, R10 // | -/* reduction */ - - MOVQ R12, AX - SUBQ (SI), AX - MOVQ R13, DX - SBBQ 8(SI), DX - MOVQ R14, DI - SBBQ 16(SI), DI - MOVQ R15, R8 - SBBQ 24(SI), R8 - MOVQ R10, R9 - SBBQ 32(SI), R9 - SBBQ $0x00, BX +/* out */ - // | - MOVQ c+0(FP), BX + MOVQ c+0(FP), R10 + CMOVQCC CX, R11 + MOVQ R11, (R10) CMOVQCC AX, R12 - MOVQ R12, (BX) - CMOVQCC DX, R13 - MOVQ R13, 8(BX) - CMOVQCC DI, R14 - MOVQ R14, 16(BX) - CMOVQCC R8, R15 - MOVQ R15, 24(BX) - CMOVQCC R9, R10 - MOVQ R10, 32(BX) + MOVQ R12, 8(R10) + CMOVQCC BX, R13 + MOVQ R13, 16(R10) + CMOVQCC R8, DI + MOVQ DI, 24(R10) RET // | -/* end */ +/* end */ -// func cpy6(dst *[6]uint64, src *[6]uint64) -TEXT ·cpy6(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - RET +// func mul_no_adx_bmi2_4(c *[4]uint64, a *[4]uint64, b *[4]uint64, p *[4]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_4(SB), NOSPLIT, $0-40 + // | -// func eq6(a *[6]uint64, b *[6]uint64) bool -TEXT ·eq6(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) +/* inputs */ -ret: - RET + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 -// func cmp6(a *[6]uint64, b *[6]uint64) int8 -TEXT ·cmp6(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), 
R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | -gt: - MOVB $0x01, ret+16(FP) - JMP ret +/* i = 0 */ -lt: - MOVB $0xff, ret+16(FP) + // | a0 @ CX + MOVQ (DI), CX -ret: - RET + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, R8 + MOVQ DX, R9 -// func add6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64) -TEXT ·add6(SB), NOSPLIT, $16-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - ADCQ $0x00, AX + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | - MOVQ p+24(FP), SI - MOVQ CX, R12 - SUBQ (SI), R12 - MOVQ DX, R13 - SBBQ 8(SI), R13 - MOVQ R8, R14 - SBBQ 16(SI), R14 - MOVQ R9, R15 - SBBQ 24(SI), R15 - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, (SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 8(SP) - SBBQ $0x00, AX + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 - // | - MOVQ c+0(FP), DI - CMOVQCC R12, CX - MOVQ CX, (DI) - CMOVQCC R13, DX - MOVQ DX, 8(DI) - CMOVQCC R14, R8 - MOVQ R8, 16(DI) - CMOVQCC R15, R9 - MOVQ R9, 24(DI) - CMOVQCC (SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 8(SP), R11 - MOVQ R11, 40(DI) - RET + // | -// func addn6(a *[6]uint64, b *[6]uint64) uint64 -TEXT ·addn6(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI +/* i = 1 */ - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - ADCQ $0x00, AX + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ AX, ret+16(FP) - RET + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX -// func double6(c *[6]uint64, a *[6]uint64, p *[6]uint64) -TEXT ·double6(SB), NOSPLIT, $16-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - ADCQ $0x00, AX + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ p+16(FP), SI - MOVQ CX, R12 - SUBQ (SI), R12 - MOVQ DX, R13 - SBBQ 8(SI), R13 - MOVQ R8, R14 - SBBQ 16(SI), R14 - MOVQ R9, R15 - SBBQ 24(SI), R15 - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, (SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 8(SP) - SBBQ $0x00, AX + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | - MOVQ c+0(FP), DI - CMOVQCC R12, CX - MOVQ CX, (DI) - CMOVQCC R13, DX - MOVQ DX, 8(DI) - CMOVQCC R14, R8 - MOVQ R8, 16(DI) - CMOVQCC R15, R9 - MOVQ R9, 24(DI) - CMOVQCC (SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 8(SP), R11 - MOVQ R11, 40(DI) - RET + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 -// func sub6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64) -TEXT ·sub6(SB), NOSPLIT, $16-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 
32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 + // | - // | - MOVQ p+24(FP), SI - MOVQ (SI), R12 - CMOVQCC AX, R12 - MOVQ 8(SI), R13 - CMOVQCC AX, R13 - MOVQ 16(SI), R14 - CMOVQCC AX, R14 - MOVQ 24(SI), R15 - CMOVQCC AX, R15 - CMOVQCS 32(SI), AX - MOVQ AX, (SP) - CMOVQCS 40(SI), AX - MOVQ AX, 8(SP) +/* i = 2 */ - // | - MOVQ c+0(FP), DI - ADDQ R12, CX - MOVQ CX, (DI) - ADCQ R13, DX - MOVQ DX, 8(DI) - ADCQ R14, R8 - MOVQ R8, 16(DI) - ADCQ R15, R9 - MOVQ R9, 24(DI) - ADCQ (SP), R10 - MOVQ R10, 32(DI) - ADCQ 8(SP), R11 - MOVQ R11, 40(DI) - RET + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX -// func subn6(a *[6]uint64, b *[6]uint64) uint64 -TEXT ·subn6(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - ADCQ $0x00, AX + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ AX, ret+16(FP) - RET + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 -// func _neg6(c *[6]uint64, a *[6]uint64, p *[6]uint64) -TEXT ·_neg6(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 + // | - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - RET +/* i = 3 */ -// func mul_two_6(a *[6]uint64) -TEXT ·mul_two_6(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RET + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX -// func div_two_6(a *[6]uint64) -TEXT ·div_two_6(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX -// func mul6(c *[12]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64, inp uint64) -TEXT ·mul6(SB), NOSPLIT, $24-40 - // | + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX -/* inputs */ + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, BX - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, BX - // | - // | - MOVQ (SI), DX + // | - // | - MULXQ (DI), AX, CX - MOVQ AX, (SP) +/* */ - // | - MULXQ 8(DI), AX, R8 - ADCXQ AX, CX + // | + // | W + // | 0 R8 | 1 R9 | 2 R10 | 3 R11 + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - MULXQ 16(DI), AX, R9 - ADCXQ AX, R8 - // | - MULXQ 24(DI), AX, R10 - ADCXQ AX, R9 + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ 32(DI), AX, R11 - ADCXQ AX, R10 + // | - // | - 
MULXQ 40(DI), AX, R12 - ADCXQ AX, R11 - ADCQ $0x00, R12 +/* montgomery reduction */ - // | - // | - MOVQ 8(SI), DX - XORQ R13, R13 + // | - // | - MULXQ (DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 - MOVQ CX, 8(SP) +/* i = 0 */ - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | + // | W + // | 0 R8 | 1 R9 | 2 R10 | 3 R11 + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | | u0 = w0 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R12 - ADOXQ R13, R13 - ADCXQ BX, R13 +/* */ - // | - // | - MOVQ 16(SI), DX - XORQ R14, R14 + // | j0 - // | - MULXQ (DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | w0 @ R8 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R8 + ADCQ DX, CX - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | j1 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | w1 @ R9 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ CX, R9 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | j2 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | w2 @ R10 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ CX, R10 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R13 - ADOXQ R14, R14 - ADCXQ BX, R14 + // | j3 - // | - // | - MOVQ 24(SI), DX - XORQ R15, R15 + // | w3 @ R11 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ CX, R11 - // | - MULXQ (DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | w4 @ R12 + ADCQ DX, R12 + ADCQ $0x00, R8 - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 - - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 +/* i = 1 */ - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | + // | W + // | 0 - | 1 R9 | 2 R10 | 3 R11 + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R14 - ADOXQ R15, R15 - ADCXQ BX, R15 - // | - // | - MOVQ 32(SI), DX - XORQ CX, CX + // | | u1 = w1 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ (DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 +/* */ - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | j0 - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | w1 @ R9 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ DX, CX - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | j1 - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R15 - ADOXQ CX, CX - ADCXQ BX, CX + // | w2 @ R10 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ CX, R10 + MOVQ $0x00, CX + ADCQ DX, CX - // | - // | - MOVQ 40(SI), DX - XORQ SI, SI + // | j2 - // | - MULXQ (DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | w3 @ R11 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ CX, R11 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | j3 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | w4 @ R12 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ DX, R8 + ADDQ CX, R12 - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | w5 @ R13 + ADCQ R8, R13 + MOVQ $0x00, R8 + ADCQ $0x00, R8 - // | - MULXQ 32(DI), AX, BX - 
ADOXQ AX, R15 - ADCXQ BX, CX + // | - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, CX - ADOXQ BX, SI - ADCQ $0x00, SI +/* i = 2 */ - // | - MOVQ (SP), DI - MOVQ 8(SP), BX - MOVQ R15, (SP) - MOVQ CX, 8(SP) - MOVQ SI, 16(SP) - MOVQ p+24(FP), SI + // | + // | W + // | 0 - | 1 - | 2 R10 | 3 R11 + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - // | - XORQ CX, CX - MOVQ DI, DX - MULXQ inp+32(FP), DX, R15 - // | - MULXQ (SI), AX, R15 - ADOXQ AX, DI - ADCXQ R15, BX + // | | u2 = w2 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, BX - ADCXQ R15, R8 + // | - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R8 - ADCXQ R15, R9 +/* */ - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R9 - ADCXQ R15, R10 + // | j0 - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R10 - ADCXQ R15, R11 + // | w2 @ R10 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ DX, CX - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 - ADOXQ CX, R12 - ADCXQ CX, CX - XORQ DI, DI + // | j1 - // | - // | - MOVQ BX, DX - MULXQ inp+32(FP), DX, R15 + // | w3 @ R11 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ CX, R11 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ (SI), AX, R15 - ADOXQ AX, BX - ADCXQ R15, R8 + // | j2 - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, R8 - ADCXQ R15, R9 + // | w4 @ R12 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ CX, R12 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R9 - ADCXQ R15, R10 + // | j3 - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R10 - ADCXQ R15, R11 + // | w5 @ R13 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R13 + ADCQ DX, R8 + ADDQ CX, R13 - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 + // | w6 @ R14 + ADCQ R8, R14 + MOVQ $0x00, R8 + ADCQ $0x00, R8 - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R12 - ADCXQ R15, R13 - ADOXQ CX, R13 - MOVQ $0x00, CX - ADCXQ CX, CX - XORQ BX, BX + // | - // | - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, R15 +/* i = 3 */ - // | - MULXQ (SI), AX, R15 - ADOXQ AX, R8 - ADCXQ R15, R9 + // | + // | W + // | 0 - | 1 - | 2 - | 3 R11 + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, R9 - ADCXQ R15, R10 - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R10 - ADCXQ R15, R11 + // | | u3 = w3 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, CX - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 + // | - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R12 - ADCXQ R15, R13 +/* */ - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R13 - ADCXQ R15, R14 - ADOXQ CX, R14 - MOVQ $0x00, CX - ADCXQ CX, CX - XORQ R8, R8 - MOVQ (SP), R8 + // | j0 - // | - // | - MOVQ R9, DX - MULXQ inp+32(FP), DX, R15 + // | w3 @ R11 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ DX, CX - // | - MULXQ (SI), AX, R15 - ADOXQ AX, R9 - ADCXQ R15, R10 + // | j1 - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, R10 - ADCXQ R15, R11 + // | w4 @ R12 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ CX, R12 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 + // | j2 - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R12 - ADCXQ R15, R13 + // | w5 @ R13 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ CX, R13 + MOVQ $0x00, CX + ADCQ DX, CX - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R13 - ADCXQ R15, R14 + // | j3 - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R14 - ADCXQ R15, R8 - ADOXQ CX, R8 - MOVQ $0x00, CX - ADCXQ CX, CX - XORQ R9, R9 - MOVQ 8(SP), R9 + // | w6 @ R14 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, 
R14 + ADCQ DX, R8 + ADDQ CX, R14 - // | - // | - MOVQ R10, DX - MULXQ inp+32(FP), DX, R15 + // | w-1 @ BX + ADCQ R8, BX + MOVQ $0x00, R8 + ADCQ $0x00, R8 - // | - MULXQ (SI), AX, R15 - ADOXQ AX, R10 - ADCXQ R15, R11 + // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - | 3 - + // | 4 R12 | 5 R13 | 6 R14 | 7 BX - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R12 - ADCXQ R15, R13 + // | - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R13 - ADCXQ R15, R14 +/* modular reduction */ - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R14 - ADCXQ R15, R8 + MOVQ R12, R9 + SUBQ (SI), R9 + MOVQ R13, R10 + SBBQ 8(SI), R10 + MOVQ R14, R11 + SBBQ 16(SI), R11 + MOVQ BX, R15 + SBBQ 24(SI), R15 + SBBQ $0x00, R8 - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R8 - ADCXQ R15, R9 - ADOXQ CX, R9 - MOVQ $0x00, CX - ADCXQ CX, CX - XORQ R10, R10 - MOVQ 16(SP), R10 + // | - // | - // | - MOVQ R11, DX - MULXQ inp+32(FP), DX, R15 +/* out */ + + MOVQ c+0(FP), R8 + CMOVQCC R9, R12 + MOVQ R12, (R8) + CMOVQCC R10, R13 + MOVQ R13, 8(R8) + CMOVQCC R11, R14 + MOVQ R14, 16(R8) + CMOVQCC R15, BX + MOVQ BX, 24(R8) + RET - // | - MULXQ (SI), AX, R15 - ADOXQ AX, R11 - ADCXQ R15, R12 + // | - // | - MULXQ 8(SI), AX, R15 - ADOXQ AX, R12 - ADCXQ R15, R13 +/* end */ - // | - MULXQ 16(SI), AX, R15 - ADOXQ AX, R13 - ADCXQ R15, R14 - // | - MULXQ 24(SI), AX, R15 - ADOXQ AX, R14 - ADCXQ R15, R8 - - // | - MULXQ 32(SI), AX, R15 - ADOXQ AX, R8 - ADCXQ R15, R9 - - // | - MULXQ 40(SI), AX, R15 - ADOXQ AX, R9 - ADCXQ R15, R10 - ADOXQ CX, R10 - MOVQ $0x00, CX - ADCXQ CX, CX - ADOXQ R11, CX - - // | - -/* reduction */ - - MOVQ R12, DX - SUBQ (SI), DX - MOVQ R13, BX - SBBQ 8(SI), BX - MOVQ R14, DI - SBBQ 16(SI), DI - MOVQ R8, R11 - SBBQ 24(SI), R11 - MOVQ R9, R15 - SBBQ 32(SI), R15 - MOVQ R10, AX - SBBQ 40(SI), AX - MOVQ AX, (SP) - SBBQ $0x00, CX - - // | - MOVQ c+0(FP), CX - CMOVQCC DX, R12 - MOVQ R12, (CX) - CMOVQCC BX, R13 - MOVQ R13, 8(CX) - CMOVQCC DI, R14 - MOVQ R14, 16(CX) - CMOVQCC R11, R8 - MOVQ R8, 24(CX) - CMOVQCC R15, R9 - MOVQ R9, 32(CX) - CMOVQCC (SP), R10 - MOVQ R10, 40(CX) - RET - - // | - -/* end */ - - -// func cpy7(dst *[7]uint64, src *[7]uint64) -TEXT ·cpy7(SB), NOSPLIT, $0-16 +// func cpy5(dst *[5]uint64, src *[5]uint64) +TEXT ·cpy5(SB), NOSPLIT, $0-16 MOVQ dst+0(FP), DI MOVQ src+8(FP), SI MOVQ (SI), R8 @@ -2842,14 +2869,10 @@ TEXT ·cpy7(SB), NOSPLIT, $0-16 MOVQ R8, 24(DI) MOVQ 32(SI), R8 MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) RET -// func eq7(a *[7]uint64, b *[7]uint64) bool -TEXT ·eq7(SB), NOSPLIT, $0-17 +// func eq5(a *[5]uint64, b *[5]uint64) bool +TEXT ·eq5(SB), NOSPLIT, $0-17 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVB $0x00, ret+16(FP) @@ -2868,29 +2891,15 @@ TEXT ·eq7(SB), NOSPLIT, $0-17 MOVQ 32(DI), R8 CMPQ 32(SI), R8 JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret MOVB $0x01, ret+16(FP) ret: RET -// func cmp7(a *[7]uint64, b *[7]uint64) int8 -TEXT ·cmp7(SB), NOSPLIT, $0-17 +// func cmp5(a *[5]uint64, b *[5]uint64) int8 +TEXT ·cmp5(SB), NOSPLIT, $0-17 MOVQ a+0(FP), DI MOVQ b+8(FP), SI - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt MOVQ 32(DI), R8 CMPQ 32(SI), R8 JB gt @@ -2924,8 +2933,8 @@ lt: ret: RET -// func add7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64) -TEXT ·add7(SB), NOSPLIT, $32-32 +// func add5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64) +TEXT ·add5(SB), NOSPLIT, $0-32 // | MOVQ 
a+8(FP), DI MOVQ b+16(FP), SI @@ -2942,54 +2951,44 @@ TEXT ·add7(SB), NOSPLIT, $32-32 ADCQ 24(SI), R9 MOVQ 32(DI), R10 ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 ADCQ $0x00, AX // | MOVQ p+24(FP), SI - MOVQ CX, R13 - SUBQ (SI), R13 - MOVQ DX, R14 - SBBQ 8(SI), R14 - MOVQ R8, R15 - SBBQ 16(SI), R15 - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, (SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 8(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 16(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 24(SP) + MOVQ CX, R11 + SUBQ (SI), R11 + MOVQ DX, R12 + SBBQ 8(SI), R12 + MOVQ R8, R13 + SBBQ 16(SI), R13 + MOVQ R9, R14 + SBBQ 24(SI), R14 + MOVQ R10, R15 + SBBQ 32(SI), R15 SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R13, CX + CMOVQCC R11, CX MOVQ CX, (DI) - CMOVQCC R14, DX + CMOVQCC R12, DX MOVQ DX, 8(DI) - CMOVQCC R15, R8 + CMOVQCC R13, R8 MOVQ R8, 16(DI) - CMOVQCC (SP), R9 + CMOVQCC R14, R9 MOVQ R9, 24(DI) - CMOVQCC 8(SP), R10 + CMOVQCC R15, R10 MOVQ R10, 32(DI) - CMOVQCC 16(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 24(SP), R12 - MOVQ R12, 48(DI) RET -// func addn7(a *[7]uint64, b *[7]uint64) uint64 -TEXT ·addn7(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func addn5(a *[5]uint64, b *[5]uint64) uint64 +TEXT ·addn5(SB), NOSPLIT, $0-24 // | MOVQ a+0(FP), DI MOVQ b+8(FP), SI @@ -3005,10 +3004,6 @@ TEXT ·addn7(SB), NOSPLIT, $0-24 ADCQ 24(SI), R9 MOVQ 32(DI), R10 ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 ADCQ $0x00, AX // | @@ -3017,13 +3012,17 @@ TEXT ·addn7(SB), NOSPLIT, $0-24 MOVQ R8, 16(DI) MOVQ R9, 24(DI) MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) MOVQ AX, ret+16(FP) RET -// func double7(c *[7]uint64, a *[7]uint64, p *[7]uint64) -TEXT ·double7(SB), NOSPLIT, $32-24 + // | + +/* end */ + + RET + +// func double5(c *[5]uint64, a *[5]uint64, p *[5]uint64) +TEXT ·double5(SB), NOSPLIT, $0-24 // | MOVQ a+8(FP), DI XORQ AX, AX @@ -3037,54 +3036,44 @@ TEXT ·double7(SB), NOSPLIT, $32-24 ADCQ R9, R9 MOVQ 32(DI), R10 ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 ADCQ $0x00, AX // | MOVQ p+16(FP), SI - MOVQ CX, R13 - SUBQ (SI), R13 - MOVQ DX, R14 - SBBQ 8(SI), R14 - MOVQ R8, R15 - SBBQ 16(SI), R15 - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, (SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 8(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 16(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 24(SP) + MOVQ CX, R11 + SUBQ (SI), R11 + MOVQ DX, R12 + SBBQ 8(SI), R12 + MOVQ R8, R13 + SBBQ 16(SI), R13 + MOVQ R9, R14 + SBBQ 24(SI), R14 + MOVQ R10, R15 + SBBQ 32(SI), R15 SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R13, CX + CMOVQCC R11, CX MOVQ CX, (DI) - CMOVQCC R14, DX + CMOVQCC R12, DX MOVQ DX, 8(DI) - CMOVQCC R15, R8 + CMOVQCC R13, R8 MOVQ R8, 16(DI) - CMOVQCC (SP), R9 + CMOVQCC R14, R9 MOVQ R9, 24(DI) - CMOVQCC 8(SP), R10 + CMOVQCC R15, R10 MOVQ R10, 32(DI) - CMOVQCC 16(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 24(SP), R12 - MOVQ R12, 48(DI) RET -// func sub7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64) -TEXT ·sub7(SB), NOSPLIT, $32-32 + // | + +/* end */ + + RET + +// func sub5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64) +TEXT ·sub5(SB), NOSPLIT, $0-32 // | MOVQ a+8(FP), DI MOVQ b+16(FP), SI @@ -3099,48 +3088,42 @@ TEXT ·sub7(SB), NOSPLIT, $32-32 SBBQ 24(SI), R9 MOVQ 32(DI), R10 SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 // | MOVQ p+24(FP), SI - MOVQ (SI), R13 + MOVQ (SI), R11 + CMOVQCC AX, R11 
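
Editor's note, illustration only: stepping back from the limb-count-specific bodies for a moment, every mulN and mul_no_adx_bmi2_N routine in this hunk (mul5 continues just below) pairs a schoolbook product held in the double-width window W with the same word-by-word Montgomery reduction, where each round picks u = w_i * inp so that the lowest live word cancels after adding u*p. The generic Go sketch below illustrates that reduction; it is slice-based, not constant time, and its names are assumptions rather than code from this patch. inp is assumed to be -p^-1 mod 2^64, as passed to the mulN routines.

package sketch

import "math/bits"

// montReduce folds the 2n-word product w back to n words modulo p.
// w must have length 2*len(p); it is consumed in place.
func montReduce(w, p []uint64, inp uint64) []uint64 {
	n := len(p)
	var extra uint64 // the spare overflow register the assembly chains between rounds
	for i := 0; i < n; i++ {
		// u is chosen so that w[i] + u*p[0] ≡ 0 (mod 2^64): the "u_i = w_i * inp" step.
		u := w[i] * inp
		var carry uint64
		for j := 0; j < n; j++ {
			hi, lo := bits.Mul64(u, p[j])
			s, c1 := bits.Add64(w[i+j], lo, 0)
			s, c2 := bits.Add64(s, carry, 0)
			w[i+j] = s
			carry = hi + c1 + c2 // always fits in one word
		}
		// Fold the running carry and the previous round's overflow bit into w[i+n].
		s, c1 := bits.Add64(w[i+n], carry, 0)
		s, c2 := bits.Add64(s, extra, 0)
		w[i+n] = s
		extra = c1 + c2 // at most 1
	}
	// w[0..n-1] are now zero; conditionally subtract p from the top half,
	// matching the final "modular reduction" / CMOVQCC block of each routine.
	t := make([]uint64, n)
	var borrow uint64
	for j := 0; j < n; j++ {
		t[j], borrow = bits.Sub64(w[n+j], p[j], borrow)
	}
	if extra != 0 || borrow == 0 {
		return t
	}
	return append([]uint64(nil), w[n:]...)
}
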
+ MOVQ 8(SI), R12 + CMOVQCC AX, R12 + MOVQ 16(SI), R13 CMOVQCC AX, R13 - MOVQ 8(SI), R14 + MOVQ 24(SI), R14 CMOVQCC AX, R14 - MOVQ 16(SI), R15 + MOVQ 32(SI), R15 CMOVQCC AX, R15 - CMOVQCS 24(SI), AX - MOVQ AX, (SP) - CMOVQCS 32(SI), AX - MOVQ AX, 8(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 16(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 24(SP) // | MOVQ c+0(FP), DI - ADDQ R13, CX + ADDQ R11, CX MOVQ CX, (DI) - ADCQ R14, DX + ADCQ R12, DX MOVQ DX, 8(DI) - ADCQ R15, R8 + ADCQ R13, R8 MOVQ R8, 16(DI) - ADCQ (SP), R9 + ADCQ R14, R9 MOVQ R9, 24(DI) - ADCQ 8(SP), R10 + ADCQ R15, R10 MOVQ R10, 32(DI) - ADCQ 16(SP), R11 - MOVQ R11, 40(DI) - ADCQ 24(SP), R12 - MOVQ R12, 48(DI) RET -// func subn7(a *[7]uint64, b *[7]uint64) uint64 -TEXT ·subn7(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func subn5(a *[5]uint64, b *[5]uint64) uint64 +TEXT ·subn5(SB), NOSPLIT, $0-24 // | MOVQ a+0(FP), DI MOVQ b+8(FP), SI @@ -3157,10 +3140,6 @@ TEXT ·subn7(SB), NOSPLIT, $0-24 SBBQ 24(SI), R9 MOVQ 32(DI), R10 SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 ADCQ $0x00, AX // | @@ -3169,13 +3148,17 @@ TEXT ·subn7(SB), NOSPLIT, $0-24 MOVQ R8, 16(DI) MOVQ R9, 24(DI) MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) MOVQ AX, ret+16(FP) RET -// func _neg7(c *[7]uint64, a *[7]uint64, p *[7]uint64) -TEXT ·_neg7(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func _neg5(c *[5]uint64, a *[5]uint64, p *[5]uint64) +TEXT ·_neg5(SB), NOSPLIT, $0-24 // | MOVQ a+8(FP), DI @@ -3191,10 +3174,6 @@ TEXT ·_neg7(SB), NOSPLIT, $0-24 SBBQ 24(DI), R9 MOVQ 32(SI), R10 SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 // | MOVQ c+0(FP), DI @@ -3203,12 +3182,16 @@ TEXT ·_neg7(SB), NOSPLIT, $0-24 MOVQ R8, 16(DI) MOVQ R9, 24(DI) MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) RET -// func mul_two_7(a *[7]uint64) -TEXT ·mul_two_7(SB), NOSPLIT, $0-8 + // | + +/* end */ + + RET + +// func mul_two_5(a *[5]uint64) +TEXT ·mul_two_5(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX RCLQ $0x01, (DI) @@ -3216,16 +3199,12 @@ TEXT ·mul_two_7(SB), NOSPLIT, $0-8 RCLQ $0x01, 16(DI) RCLQ $0x01, 24(DI) RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) RET -// func div_two_7(a *[7]uint64) -TEXT ·div_two_7(SB), NOSPLIT, $0-8 +// func div_two_5(a *[5]uint64) +TEXT ·div_two_5(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) RCRQ $0x01, 32(DI) RCRQ $0x01, 24(DI) RCRQ $0x01, 16(DI) @@ -3233,2035 +3212,1280 @@ TEXT ·div_two_7(SB), NOSPLIT, $0-8 RCRQ $0x01, (DI) RET -// func mul7(c *[14]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64, inp uint64) -TEXT ·mul7(SB), NOSPLIT, $40-40 +// func mul5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64, inp uint64) +TEXT ·mul5(SB), NOSPLIT, $0-40 // | -/* inputs */ +/* inputs */ MOVQ a+8(FP), DI MOVQ b+16(FP), SI XORQ AX, AX - // | - // | - MOVQ (SI), DX + // | - // | - MULXQ (DI), AX, CX - MOVQ AX, (SP) +/* i = 0 */ - // | - MULXQ 8(DI), AX, R8 - ADCXQ AX, CX + // | a0 @ DX + MOVQ (DI), DX - // | - MULXQ 16(DI), AX, R9 + // | a0 * b0 + MULXQ (SI), CX, R8 + + // | a0 * b1 + MULXQ 8(SI), AX, R9 ADCXQ AX, R8 - // | - MULXQ 24(DI), AX, R10 + // | a0 * b2 + MULXQ 16(SI), AX, R10 ADCXQ AX, R9 - // | - MULXQ 32(DI), AX, R11 + // | a0 * b3 + MULXQ 24(SI), AX, R11 ADCXQ AX, R10 - // | - MULXQ 40(DI), AX, R12 + // | a0 * b4 + MULXQ 32(SI), AX, R12 ADCXQ AX, R11 + ADCQ $0x00, R12 - // | - MULXQ 48(DI), AX, R13 - ADCXQ AX, R12 - ADCQ $0x00, R13 + // | - // | - // | - MOVQ 8(SI), DX - XORQ 
R14, R14 +/* i = 1 */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 - MOVQ CX, 8(SP) + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R13, R13 - // | - MULXQ 8(DI), AX, BX + // | a1 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R8 ADCXQ BX, R9 - // | - MULXQ 16(DI), AX, BX + // | a1 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 24(DI), AX, BX + // | a1 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 32(DI), AX, BX + // | a1 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 40(DI), AX, BX + // | a1 * b4 + MULXQ 32(SI), AX, BX ADOXQ AX, R12 + ADOXQ R13, R13 ADCXQ BX, R13 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R13 - ADOXQ R14, R14 - ADCXQ BX, R14 + // | - // | - // | - MOVQ 16(SI), DX - XORQ R15, R15 +/* i = 2 */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 - MOVQ R8, 16(SP) + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R14, R14 - // | - MULXQ 8(DI), AX, BX + // | a2 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R9 ADCXQ BX, R10 - // | - MULXQ 16(DI), AX, BX + // | a2 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 24(DI), AX, BX + // | a2 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 32(DI), AX, BX + // | a2 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R12 ADCXQ BX, R13 - // | - MULXQ 40(DI), AX, BX + // | a2 * b4 + MULXQ 32(SI), AX, BX ADOXQ AX, R13 + ADOXQ R14, R14 ADCXQ BX, R14 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R14 - ADOXQ R15, R15 - ADCXQ BX, R15 + // | - // | - // | - MOVQ 24(SI), DX - XORQ CX, CX +/* i = 3 */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 - MOVQ R9, 24(SP) + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R15, R15 - // | - MULXQ 8(DI), AX, BX + // | a3 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R10 ADCXQ BX, R11 - // | - MULXQ 16(DI), AX, BX + // | a3 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 24(DI), AX, BX + // | a3 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R12 ADCXQ BX, R13 - // | - MULXQ 32(DI), AX, BX + // | a3 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R13 ADCXQ BX, R14 - // | - MULXQ 40(DI), AX, BX + // | a3 * b4 + MULXQ 32(SI), AX, BX ADOXQ AX, R14 + ADOXQ R15, R15 ADCXQ BX, R15 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R15 - ADOXQ CX, CX - ADCXQ BX, CX + // | - // | - // | - MOVQ 32(SI), DX - XORQ R8, R8 +/* i = 4 */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | a4 @ DX + MOVQ 32(DI), DX + XORQ DI, DI - // | - MULXQ 8(DI), AX, BX + // | a4 * b0 + MULXQ (SI), AX, BX ADOXQ AX, R11 ADCXQ BX, R12 - // | - MULXQ 16(DI), AX, BX + // | a4 * b1 + MULXQ 8(SI), AX, BX ADOXQ AX, R12 ADCXQ BX, R13 - // | - MULXQ 24(DI), AX, BX + // | a4 * b2 + MULXQ 16(SI), AX, BX ADOXQ AX, R13 ADCXQ BX, R14 - // | - MULXQ 32(DI), AX, BX + // | a4 * b3 + MULXQ 24(SI), AX, BX ADOXQ AX, R14 ADCXQ BX, R15 - // | - MULXQ 40(DI), AX, BX + // | a4 * b4 + MULXQ 32(SI), AX, BX ADOXQ AX, R15 - ADCXQ BX, CX + ADOXQ BX, DI + ADCQ $0x00, DI - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, CX - ADOXQ R8, R8 - ADCXQ BX, R8 + // | - // | - // | - MOVQ 40(SI), DX - XORQ R9, R9 +/* */ - // | - MULXQ (DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | + // | W + // | 0 CX | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | + // | W ready to mont + // | 0 CX | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 
9 DI - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 + // | - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R8 - ADOXQ R9, R9 - ADCXQ BX, R9 +/* montgomery reduction */ - // | - // | - MOVQ 48(SI), DX - XORQ SI, SI + // | clear flags + XORQ AX, AX - // | - MULXQ (DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 +/* i = 0 */ - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | + // | W + // | 0 CX | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX - // | - MULXQ 32(DI), AX, BX + // | | u0 = w0 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w0 @ CX + MULXQ (SI), AX, BX ADOXQ AX, CX ADCXQ BX, R8 - // | - MULXQ 40(DI), AX, BX + // | j1 + + // | w1 @ R8 + MULXQ 8(SI), AX, BX ADOXQ AX, R8 ADCXQ BX, R9 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R9 - ADOXQ BX, SI - ADCQ $0x00, SI - - // | - MOVQ (SP), DI - MOVQ 8(SP), BX - MOVQ R15, (SP) - MOVQ 16(SP), R15 - MOVQ CX, 8(SP) - MOVQ 24(SP), CX - MOVQ R8, 16(SP) - MOVQ R9, 24(SP) - MOVQ SI, 32(SP) - MOVQ p+24(FP), SI + // | j2 - // | - // | - XORQ R9, R9 - MOVQ DI, DX - MULXQ inp+32(FP), DX, R8 + // | w2 @ R9 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | - MULXQ (SI), AX, R8 - ADOXQ AX, DI - ADCXQ R8, BX + // | j3 - // | - MULXQ 8(SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 + // | w3 @ R10 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | - MULXQ 16(SI), AX, R8 - ADOXQ AX, R15 - ADCXQ R8, CX + // | j4 - // | - MULXQ 24(SI), AX, R8 + // | w4 @ R11 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + ADOXQ CX, R12 + ADCXQ CX, CX + MOVQ $0x00, AX ADOXQ AX, CX - ADCXQ R8, R10 - // | - MULXQ 32(SI), AX, R8 - ADOXQ AX, R10 - ADCXQ R8, R11 + // | clear flags + XORQ AX, AX - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, R11 - ADCXQ R8, R12 + // | - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, R12 - ADCXQ R8, R13 - ADOXQ R9, R13 - ADCXQ R9, R9 - XORQ DI, DI +/* i = 1 */ - // | - // | - MOVQ BX, DX - MULXQ inp+32(FP), DX, R8 + // | + // | W + // | 0 - | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ (SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 - // | - MULXQ 8(SI), AX, R8 - ADOXQ AX, R15 - ADCXQ R8, CX + // | | u1 = w1 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, BX - // | - MULXQ 16(SI), AX, R8 - ADOXQ AX, CX - ADCXQ R8, R10 + // | - // | - MULXQ 24(SI), AX, R8 - ADOXQ AX, R10 - ADCXQ R8, R11 +/* */ - // | - MULXQ 32(SI), AX, R8 - ADOXQ AX, R11 - ADCXQ R8, R12 + // | j0 - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, R12 - ADCXQ R8, R13 + // | w1 @ R8 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, R13 - ADCXQ R8, R14 - ADOXQ R9, R14 - MOVQ $0x00, R9 - ADCXQ R9, R9 - XORQ BX, BX - MOVQ (SP), BX - - // | - // | - MOVQ R15, DX - MULXQ inp+32(FP), DX, R8 + // | j1 - // | - MULXQ (SI), AX, R8 - ADOXQ AX, R15 - ADCXQ R8, CX + // | w2 @ R9 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | - MULXQ 8(SI), AX, R8 - ADOXQ AX, CX - ADCXQ R8, R10 + // | j2 - // | - MULXQ 16(SI), AX, R8 + // | w3 @ R10 + MULXQ 16(SI), AX, BX ADOXQ AX, R10 - ADCXQ R8, R11 + ADCXQ BX, R11 - // | - MULXQ 24(SI), AX, R8 + // | j3 + + // | w4 @ R11 + MULXQ 24(SI), AX, BX ADOXQ AX, R11 - ADCXQ R8, R12 + ADCXQ BX, R12 - // | - MULXQ 32(SI), AX, R8 + // | j4 + + // | w5 @ R12 + MULXQ 32(SI), AX, BX ADOXQ AX, R12 - ADCXQ R8, R13 + 
ADCXQ BX, R13 + ADOXQ CX, R13 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, R13 - ADCXQ R8, R14 + // | clear flags + XORQ AX, AX - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, R14 - ADCXQ R8, BX - ADOXQ R9, BX - MOVQ $0x00, R9 - ADCXQ R9, R9 - XORQ R15, R15 - MOVQ 8(SP), R15 + // | - // | - // | - MOVQ CX, DX - MULXQ inp+32(FP), DX, R8 +/* i = 2 */ - // | - MULXQ (SI), AX, R8 - ADOXQ AX, CX - ADCXQ R8, R10 + // | + // | W + // | 0 - | 1 - | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ 8(SI), AX, R8 + + // | | u2 = w2 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, BX + + // | + +/* */ + + // | j0 + + // | w2 @ R9 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | j1 + + // | w3 @ R10 + MULXQ 8(SI), AX, BX ADOXQ AX, R10 - ADCXQ R8, R11 + ADCXQ BX, R11 - // | - MULXQ 16(SI), AX, R8 + // | j2 + + // | w4 @ R11 + MULXQ 16(SI), AX, BX ADOXQ AX, R11 - ADCXQ R8, R12 + ADCXQ BX, R12 - // | - MULXQ 24(SI), AX, R8 + // | j3 + + // | w5 @ R12 + MULXQ 24(SI), AX, BX ADOXQ AX, R12 - ADCXQ R8, R13 + ADCXQ BX, R13 - // | - MULXQ 32(SI), AX, R8 + // | j4 + + // | w6 @ R13 + MULXQ 32(SI), AX, BX ADOXQ AX, R13 - ADCXQ R8, R14 + ADCXQ BX, R14 + ADOXQ R8, R14 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, R14 - ADCXQ R8, BX + // | clear flags + XORQ AX, AX - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 - ADOXQ R9, R15 - MOVQ $0x00, R9 - ADCXQ R9, R9 - XORQ CX, CX - MOVQ 16(SP), CX + // | - // | - // | +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI + + + // | | u3 = w3 * inp MOVQ R10, DX - MULXQ inp+32(FP), DX, R8 + MULXQ inp+32(FP), DX, BX - // | - MULXQ (SI), AX, R8 + // | + +/* */ + + // | j0 + + // | w3 @ R10 + MULXQ (SI), AX, BX ADOXQ AX, R10 - ADCXQ R8, R11 + ADCXQ BX, R11 - // | - MULXQ 8(SI), AX, R8 + // | j1 + + // | w4 @ R11 + MULXQ 8(SI), AX, BX ADOXQ AX, R11 - ADCXQ R8, R12 + ADCXQ BX, R12 - // | - MULXQ 16(SI), AX, R8 + // | j2 + + // | w5 @ R12 + MULXQ 16(SI), AX, BX ADOXQ AX, R12 - ADCXQ R8, R13 + ADCXQ BX, R13 - // | - MULXQ 24(SI), AX, R8 + // | j3 + + // | w6 @ R13 + MULXQ 24(SI), AX, BX ADOXQ AX, R13 - ADCXQ R8, R14 + ADCXQ BX, R14 - // | - MULXQ 32(SI), AX, R8 + // | j4 + + // | w7 @ R14 + MULXQ 32(SI), AX, BX ADOXQ AX, R14 - ADCXQ R8, BX + ADCXQ BX, R15 + ADOXQ R9, R15 + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 + // | clear flags + XORQ AX, AX - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, R15 - ADCXQ R8, CX - ADOXQ R9, CX - MOVQ $0x00, R9 - ADCXQ R9, R9 - XORQ R10, R10 - MOVQ 24(SP), R10 + // | - // | - // | - MOVQ R11, DX - MULXQ inp+32(FP), DX, R8 +/* i = 4 */ - // | - MULXQ (SI), AX, R8 - ADOXQ AX, R11 - ADCXQ R8, R12 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ 8(SI), AX, R8 - ADOXQ AX, R12 - ADCXQ R8, R13 - // | - MULXQ 16(SI), AX, R8 - ADOXQ AX, R13 - ADCXQ R8, R14 + // | | u4 = w4 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, BX - // | - MULXQ 24(SI), AX, R8 - ADOXQ AX, R14 - ADCXQ R8, BX + // | - // | - MULXQ 32(SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 +/* */ - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, R15 - ADCXQ R8, CX + // | j0 - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, CX - ADCXQ R8, R10 - ADOXQ R9, R10 - MOVQ $0x00, R9 - ADCXQ R9, R9 - XORQ R11, R11 - MOVQ 32(SP), R11 + // | w4 @ R11 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | - // | - 
MOVQ R12, DX - MULXQ inp+32(FP), DX, R8 + // | j1 - // | - MULXQ (SI), AX, R8 + // | w5 @ R12 + MULXQ 8(SI), AX, BX ADOXQ AX, R12 - ADCXQ R8, R13 + ADCXQ BX, R13 - // | - MULXQ 8(SI), AX, R8 + // | j2 + + // | w6 @ R13 + MULXQ 16(SI), AX, BX ADOXQ AX, R13 - ADCXQ R8, R14 + ADCXQ BX, R14 - // | - MULXQ 16(SI), AX, R8 + // | j3 + + // | w7 @ R14 + MULXQ 24(SI), AX, BX ADOXQ AX, R14 - ADCXQ R8, BX + ADCXQ BX, R15 - // | - MULXQ 24(SI), AX, R8 - ADOXQ AX, BX - ADCXQ R8, R15 + // | j4 - // | - MULXQ 32(SI), AX, R8 + // | w8 @ R15 + MULXQ 32(SI), AX, BX ADOXQ AX, R15 - ADCXQ R8, CX + ADCXQ BX, DI + ADOXQ R10, DI + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 - // | - MULXQ 40(SI), AX, R8 - ADOXQ AX, CX - ADCXQ R8, R10 + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 DI - // | - MULXQ 48(SI), AX, R8 - ADOXQ AX, R10 - ADCXQ R8, R11 - ADOXQ R9, R11 - MOVQ $0x00, R9 - ADCXQ R9, R9 - ADOXQ R12, R9 // | -/* reduction */ +/* modular reduction */ - MOVQ R13, DX - SUBQ (SI), DX - MOVQ R14, DI - SBBQ 8(SI), DI - MOVQ BX, R8 - SBBQ 16(SI), R8 - MOVQ R15, R12 - SBBQ 24(SI), R12 - MOVQ CX, AX - SBBQ 32(SI), AX - MOVQ AX, (SP) - MOVQ R10, AX - SBBQ 40(SI), AX - MOVQ AX, 8(SP) - MOVQ R11, AX - SBBQ 48(SI), AX - MOVQ AX, 16(SP) - SBBQ $0x00, R9 + MOVQ R12, CX + SUBQ (SI), CX + MOVQ R13, AX + SBBQ 8(SI), AX + MOVQ R14, BX + SBBQ 16(SI), BX + MOVQ R15, R8 + SBBQ 24(SI), R8 + MOVQ DI, R9 + SBBQ 32(SI), R9 + SBBQ $0x00, R11 - // | - MOVQ c+0(FP), R9 - CMOVQCC DX, R13 - MOVQ R13, (R9) - CMOVQCC DI, R14 - MOVQ R14, 8(R9) - CMOVQCC R8, BX - MOVQ BX, 16(R9) - CMOVQCC R12, R15 - MOVQ R15, 24(R9) - CMOVQCC (SP), CX - MOVQ CX, 32(R9) - CMOVQCC 8(SP), R10 - MOVQ R10, 40(R9) - CMOVQCC 16(SP), R11 - MOVQ R11, 48(R9) + // | + +/* out */ + + MOVQ c+0(FP), R11 + CMOVQCC CX, R12 + MOVQ R12, (R11) + CMOVQCC AX, R13 + MOVQ R13, 8(R11) + CMOVQCC BX, R14 + MOVQ R14, 16(R11) + CMOVQCC R8, R15 + MOVQ R15, 24(R11) + CMOVQCC R9, DI + MOVQ DI, 32(R11) RET // | -/* end */ +/* end */ -// func cpy8(dst *[8]uint64, src *[8]uint64) -TEXT ·cpy8(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - RET +// func mul_no_adx_bmi2_5(c *[5]uint64, a *[5]uint64, b *[5]uint64, p *[5]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_5(SB), NOSPLIT, $16-40 + // | -// func eq8(a *[8]uint64, b *[8]uint64) bool -TEXT ·eq8(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) +/* inputs */ -ret: - RET + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 -// func cmp8(a *[8]uint64, b *[8]uint64) int8 -TEXT ·cmp8(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - 
CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | -gt: - MOVB $0x01, ret+16(FP) - JMP ret +/* i = 0 */ -lt: - MOVB $0xff, ret+16(FP) + // | a0 @ CX + MOVQ (DI), CX -ret: - RET + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 -// func add8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64) -TEXT ·add8(SB), NOSPLIT, $48-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - ADCQ $0x00, AX + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 - // | - MOVQ p+24(FP), SI - MOVQ CX, R14 - SUBQ (SI), R14 - MOVQ DX, R15 - SBBQ 8(SI), R15 - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, (SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 8(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 16(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 24(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 32(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 40(SP) - SBBQ $0x00, AX + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | - MOVQ c+0(FP), DI - CMOVQCC R14, CX - MOVQ CX, (DI) - CMOVQCC R15, DX - MOVQ DX, 8(DI) - CMOVQCC (SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 8(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 16(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 24(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 32(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 40(SP), R13 - MOVQ R13, 56(DI) - RET + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 -// func addn8(a *[8]uint64, b *[8]uint64) uint64 -TEXT ·addn8(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - ADCQ $0x00, AX +/* i = 1 */ - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ AX, ret+16(FP) - RET + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX -// func double8(c *[8]uint64, a *[8]uint64, p *[8]uint64) -TEXT ·double8(SB), NOSPLIT, $48-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - ADCQ $0x00, AX + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX - // | - MOVQ p+16(FP), SI - MOVQ CX, R14 - SUBQ (SI), R14 - MOVQ DX, R15 - SBBQ 8(SI), R15 - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, (SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 8(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 16(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 24(SP) - MOVQ R12, 
BX - SBBQ 48(SI), BX - MOVQ BX, 32(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 40(SP) - SBBQ $0x00, AX + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ c+0(FP), DI - CMOVQCC R14, CX - MOVQ CX, (DI) - CMOVQCC R15, DX - MOVQ DX, 8(DI) - CMOVQCC (SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 8(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 16(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 24(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 32(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 40(SP), R13 - MOVQ R13, 56(DI) - RET - -// func sub8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64) -TEXT ·sub8(SB), NOSPLIT, $48-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ p+24(FP), SI - MOVQ (SI), R14 - CMOVQCC AX, R14 - MOVQ 8(SI), R15 - CMOVQCC AX, R15 - CMOVQCS 16(SI), AX - MOVQ AX, (SP) - CMOVQCS 24(SI), AX - MOVQ AX, 8(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 16(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 24(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 40(SP) + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | - MOVQ c+0(FP), DI - ADDQ R14, CX - MOVQ CX, (DI) - ADCQ R15, DX - MOVQ DX, 8(DI) - ADCQ (SP), R8 - MOVQ R8, 16(DI) - ADCQ 8(SP), R9 - MOVQ R9, 24(DI) - ADCQ 16(SP), R10 - MOVQ R10, 32(DI) - ADCQ 24(SP), R11 - MOVQ R11, 40(DI) - ADCQ 32(SP), R12 - MOVQ R12, 48(DI) - ADCQ 40(SP), R13 - MOVQ R13, 56(DI) - RET + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 -// func subn8(a *[8]uint64, b *[8]uint64) uint64 -TEXT ·subn8(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - ADCQ $0x00, AX +/* i = 2 */ - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ AX, ret+16(FP) - RET + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX -// func _neg8(c *[8]uint64, a *[8]uint64, p *[8]uint64) -TEXT ·_neg8(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - RET + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + 
ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func mul_two_8(a *[8]uint64) -TEXT ·mul_two_8(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RET + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 -// func div_two_8(a *[8]uint64) -TEXT ·div_two_8(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 -// func mul8(c *[16]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64, inp uint64) -TEXT ·mul8(SB), NOSPLIT, $56-40 // | -/* inputs */ - - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX +/* i = 3 */ - // | - // | - MOVQ (SI), DX + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX - // | - MULXQ (DI), AX, CX - MOVQ AX, (SP) + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX - // | - MULXQ 8(DI), AX, R8 - ADCXQ AX, CX + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MULXQ 16(DI), AX, R9 - ADCXQ AX, R8 + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MULXQ 24(DI), AX, R10 - ADCXQ AX, R9 + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 - // | - MULXQ 32(DI), AX, R11 - ADCXQ AX, R10 + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 - // | - MULXQ 40(DI), AX, R12 - ADCXQ AX, R11 + // | - // | - MULXQ 48(DI), AX, R13 - ADCXQ AX, R12 +/* i = 4 */ - // | - MULXQ 56(DI), AX, R14 - ADCXQ AX, R13 - ADCQ $0x00, R14 + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX - // | - // | - MOVQ 8(SI), DX - XORQ R15, R15 + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX - // | - MULXQ (DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 - MOVQ CX, 8(SP) + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, BX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, BX - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 +/* */ - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | + // | W + // | 0 (SP) | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 BX - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R14 - ADOXQ R15, R15 - ADCXQ BX, R15 - // | - // | - MOVQ 16(SI), DX - XORQ CX, CX + MOVQ (SP), CX + MOVQ BX, (SP) - // | - MULXQ (DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 - MOVQ R8, 16(SP) + // | fetch modulus + MOVQ p+24(FP), SI - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 +/* montgomery 
reduction */ - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 +/* i = 0 */ - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | + // | W + // | 0 CX | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 (SP) - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R15 - ADOXQ CX, CX - ADCXQ BX, CX + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, BX - // | - // | - MOVQ 24(SI), DX - XORQ R8, R8 + // | - // | - MULXQ (DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 - MOVQ R9, 24(SP) +/* */ - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | j0 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | w0 @ CX + MOVQ (SI), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, BX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | j1 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | w1 @ R8 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ BX, R8 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | j2 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX + // | w2 @ R9 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ BX, R9 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, CX - ADOXQ R8, R8 - ADCXQ BX, R8 + // | j3 - // | - // | - MOVQ 32(SI), DX - XORQ R9, R9 + // | w3 @ R10 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ (DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 - MOVQ R10, 32(SP) + // | j4 - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 + // | w4 @ R11 + MOVQ 32(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | w5 @ R12 + ADCQ DX, R12 + ADCQ $0x00, CX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 +/* i = 1 */ - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX + // | + // | W + // | 0 - | 1 R8 | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 (SP) - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R8 - ADOXQ R9, R9 - ADCXQ BX, R9 + // | | u1 = w1 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, BX - // | - // | - MOVQ 40(SI), DX - XORQ R10, R10 + // | - // | - MULXQ (DI), AX, BX - ADOXQ AX, R11 - ADCXQ BX, R12 - MOVQ R11, 40(SP) +/* */ - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | j0 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | w1 @ R8 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R8 + ADCQ DX, BX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | j1 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX + // | w2 @ R9 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ BX, R9 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 + // | j2 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | w3 @ R10 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R9 - ADOXQ R10, R10 - ADCXQ BX, R10 + // | j3 - // | - // | - MOVQ 48(SI), DX - XORQ R11, R11 + // | w4 @ R11 + MOVQ 24(SI), AX + MULQ DI + 
ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ (DI), AX, BX - ADOXQ AX, R12 - ADCXQ BX, R13 + // | j4 - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 + // | w5 @ R12 + MOVQ 32(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ DX, CX + ADDQ BX, R12 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | w6 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX + // | - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 +/* i = 2 */ - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | + // | W + // | 0 - | 1 - | 2 R9 | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 (SP) - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R10 - ADOXQ R11, R11 - ADCXQ BX, R11 + // | | u2 = w2 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, BX - // | - // | - MOVQ 56(SI), DX - XORQ SI, SI + // | - // | - MULXQ (DI), AX, BX - ADOXQ AX, R13 - ADCXQ BX, R14 +/* */ - // | - MULXQ 8(DI), AX, BX - ADOXQ AX, R14 - ADCXQ BX, R15 + // | j0 - // | - MULXQ 16(DI), AX, BX - ADOXQ AX, R15 - ADCXQ BX, CX + // | w2 @ R9 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R9 + ADCQ DX, BX - // | - MULXQ 24(DI), AX, BX - ADOXQ AX, CX - ADCXQ BX, R8 + // | j1 - // | - MULXQ 32(DI), AX, BX - ADOXQ AX, R8 - ADCXQ BX, R9 + // | w3 @ R10 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 40(DI), AX, BX - ADOXQ AX, R9 - ADCXQ BX, R10 + // | j2 - // | - MULXQ 48(DI), AX, BX - ADOXQ AX, R10 - ADCXQ BX, R11 + // | w4 @ R11 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 56(DI), AX, BX - ADOXQ AX, R11 - ADOXQ BX, SI - ADCQ $0x00, SI + // | j3 - // | - MOVQ (SP), DI - MOVQ 8(SP), BX - MOVQ R15, (SP) - MOVQ 16(SP), R15 - MOVQ CX, 8(SP) - MOVQ 24(SP), CX - MOVQ R8, 16(SP) - MOVQ 32(SP), R8 - MOVQ R9, 24(SP) - MOVQ 40(SP), R9 - MOVQ R10, 32(SP) - MOVQ R11, 40(SP) - MOVQ SI, 48(SP) - MOVQ p+24(FP), SI + // | w5 @ R12 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX - // | - // | - XORQ R11, R11 - MOVQ DI, DX - MULXQ inp+32(FP), DX, R10 + // | j4 - // | - MULXQ (SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX + // | w6 @ R13 + MOVQ 32(SI), AX + MULQ DI + ADDQ AX, R13 + ADCQ DX, CX + ADDQ BX, R13 - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 + // | w7 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX + // | - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 +/* i = 3 */ - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 + // | + // | W + // | 0 - | 1 - | 2 - | 3 R10 | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 (SP) - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + // | | u3 = w3 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, BX - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 - ADOXQ R11, R14 - ADCXQ R11, R11 - XORQ DI, DI - MOVQ (SP), DI + // | - // | - // | - MOVQ BX, DX - MULXQ inp+32(FP), DX, R10 +/* */ - // | - MULXQ (SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 + // | j0 - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX + // | w3 @ R10 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R10 + ADCQ DX, BX - // | - MULXQ 16(SI), AX, 
R10 - ADOXQ AX, CX - ADCXQ R10, R8 + // | j1 - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 + // | w4 @ R11 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 + // | j2 - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + // | w5 @ R12 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 + // | j3 - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI - ADOXQ R11, DI - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ BX, BX - MOVQ 8(SP), BX + // | w6 @ R13 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX - // | - // | - MOVQ R15, DX - MULXQ inp+32(FP), DX, R10 + // | j4 - // | - MULXQ (SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX + // | w7 @ R14 + MOVQ 32(SI), AX + MULQ DI + ADDQ AX, R14 + ADCQ DX, CX + ADDQ BX, R14 - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 + // | w8 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 + // | - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 +/* i = 4 */ - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 (SP) - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI + // | | u4 = w4 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, BX - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX - ADOXQ R11, BX - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ R15, R15 - MOVQ 16(SP), R15 + // | - // | - // | - MOVQ CX, DX - MULXQ inp+32(FP), DX, R10 +/* */ - // | - MULXQ (SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 + // | j0 - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 + // | w4 @ R11 + MOVQ (SI), AX + MULQ DI + ADDQ AX, R11 + ADCQ DX, BX - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 + // | j1 - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + // | w5 @ R12 + MOVQ 8(SI), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 + // | j2 - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI + // | w6 @ R13 + MOVQ 16(SI), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX + // | j3 - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 - ADOXQ R11, R15 - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ CX, CX - MOVQ 24(SP), CX + // | w7 @ R14 + MOVQ 24(SI), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ BX, R14 + MOVQ $0x00, BX + ADCQ DX, BX - // | - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, R10 + // | j4 - // | - MULXQ (SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 + // | w8 @ R15 + MOVQ 32(SI), AX + MULQ DI + ADDQ AX, R15 + ADCQ DX, CX + ADDQ BX, R15 - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 + // | move to idle register + MOVQ (SP), R8 - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + // | w-1 @ R8 + ADCQ CX, R8 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 + // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - | 3 - 
| 4 - + // | 5 R12 | 6 R13 | 7 R14 | 8 R15 | 9 R8 - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX + // | - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 +/* modular reduction */ - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX - ADOXQ R11, CX - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ R8, R8 - MOVQ 32(SP), R8 + MOVQ R12, R9 + SUBQ (SI), R9 + MOVQ R13, R10 + SBBQ 8(SI), R10 + MOVQ R14, R11 + SBBQ 16(SI), R11 + MOVQ R15, DX + SBBQ 24(SI), DX + MOVQ DX, (SP) + MOVQ R8, DX + SBBQ 32(SI), DX + MOVQ DX, 8(SP) + SBBQ $0x00, CX - // | - // | - MOVQ R9, DX - MULXQ inp+32(FP), DX, R10 + // | - // | - MULXQ (SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 +/* out */ - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 + MOVQ c+0(FP), CX + CMOVQCC R9, R12 + MOVQ R12, (CX) + CMOVQCC R10, R13 + MOVQ R13, 8(CX) + CMOVQCC R11, R14 + MOVQ R14, 16(CX) + CMOVQCC (SP), R15 + MOVQ R15, 24(CX) + CMOVQCC 8(SP), R8 + MOVQ R8, 32(CX) + RET - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 + // | - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI +/* end */ - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 +// func cpy6(dst *[6]uint64, src *[6]uint64) +TEXT ·cpy6(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + RET - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX - - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 - ADOXQ R11, R8 - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ R9, R9 - MOVQ 40(SP), R9 - - // | - // | - MOVQ R12, DX - MULXQ inp+32(FP), DX, R10 - - // | - MULXQ (SI), AX, R10 - ADOXQ AX, R12 - ADCXQ R10, R13 - - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 - - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI - - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX - - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 - - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX - - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 - - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 - ADOXQ R11, R9 - MOVQ $0x00, R11 - ADCXQ R11, R11 - XORQ R12, R12 - MOVQ 48(SP), R12 - - // | - // | - MOVQ R13, DX - MULXQ inp+32(FP), DX, R10 - - // | - MULXQ (SI), AX, R10 - ADOXQ AX, R13 - ADCXQ R10, R14 - - // | - MULXQ 8(SI), AX, R10 - ADOXQ AX, R14 - ADCXQ R10, DI - - // | - MULXQ 16(SI), AX, R10 - ADOXQ AX, DI - ADCXQ R10, BX - - // | - MULXQ 24(SI), AX, R10 - ADOXQ AX, BX - ADCXQ R10, R15 - - // | - MULXQ 32(SI), AX, R10 - ADOXQ AX, R15 - ADCXQ R10, CX - - // | - MULXQ 40(SI), AX, R10 - ADOXQ AX, CX - ADCXQ R10, R8 - - // | - MULXQ 48(SI), AX, R10 - ADOXQ AX, R8 - ADCXQ R10, R9 - - // | - MULXQ 56(SI), AX, R10 - ADOXQ AX, R9 - ADCXQ R10, R12 - ADOXQ R11, R12 - MOVQ $0x00, R11 - ADCXQ R11, R11 - ADOXQ R13, R11 - - // | - -/* reduction */ - - MOVQ R14, DX - SUBQ (SI), DX - MOVQ DI, R10 - SBBQ 8(SI), R10 - MOVQ BX, R13 - SBBQ 16(SI), R13 - MOVQ R15, AX - SBBQ 24(SI), AX - MOVQ AX, (SP) - MOVQ CX, AX - SBBQ 32(SI), AX - MOVQ AX, 8(SP) - MOVQ R8, AX - SBBQ 40(SI), AX - MOVQ AX, 16(SP) - MOVQ R9, AX - SBBQ 48(SI), AX - MOVQ AX, 24(SP) - MOVQ R12, AX - SBBQ 56(SI), AX - MOVQ AX, 32(SP) - SBBQ $0x00, 
R11 - - // | - MOVQ c+0(FP), R11 - CMOVQCC DX, R14 - MOVQ R14, (R11) - CMOVQCC R10, DI - MOVQ DI, 8(R11) - CMOVQCC R13, BX - MOVQ BX, 16(R11) - CMOVQCC (SP), R15 - MOVQ R15, 24(R11) - CMOVQCC 8(SP), CX - MOVQ CX, 32(R11) - CMOVQCC 16(SP), R8 - MOVQ R8, 40(R11) - CMOVQCC 24(SP), R9 - MOVQ R9, 48(R11) - CMOVQCC 32(SP), R12 - MOVQ R12, 56(R11) - RET - - // | - -/* end */ - - - -// func cpy9(dst *[9]uint64, src *[9]uint64) -TEXT ·cpy9(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - RET - -// func eq9(a *[9]uint64, b *[9]uint64) bool -TEXT ·eq9(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) +// func eq6(a *[6]uint64, b *[6]uint64) bool +TEXT ·eq6(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) ret: RET -// func cmp9(a *[9]uint64, b *[9]uint64) int8 -TEXT ·cmp9(SB), NOSPLIT, $0-17 +// func cmp6(a *[6]uint64, b *[6]uint64) int8 +TEXT ·cmp6(SB), NOSPLIT, $0-17 MOVQ a+0(FP), DI MOVQ b+8(FP), SI - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt MOVQ 40(DI), R8 CMPQ 40(SI), R8 JB gt @@ -5299,8 +4523,8 @@ lt: ret: RET -// func add9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64) -TEXT ·add9(SB), NOSPLIT, $64-32 +// func add6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64) +TEXT ·add6(SB), NOSPLIT, $16-32 // | MOVQ a+8(FP), DI MOVQ b+16(FP), SI @@ -5319,71 +4543,53 @@ TEXT ·add9(SB), NOSPLIT, $64-32 ADCQ 32(SI), R10 MOVQ 40(DI), R11 ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 ADCQ $0x00, AX // | MOVQ p+24(FP), SI - MOVQ CX, R15 - SUBQ (SI), R15 - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, (SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 8(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 16(SP) + MOVQ CX, R12 + SUBQ (SI), R12 + MOVQ DX, R13 + SBBQ 8(SI), R13 + MOVQ R8, R14 + SBBQ 16(SI), R14 + MOVQ R9, R15 + SBBQ 24(SI), R15 MOVQ R10, BX SBBQ 32(SI), BX - MOVQ BX, 24(SP) + MOVQ BX, (SP) MOVQ R11, BX SBBQ 40(SI), BX - MOVQ BX, 32(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 40(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 48(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 56(SP) + MOVQ BX, 8(SP) SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R15, CX + CMOVQCC R12, CX MOVQ CX, (DI) - CMOVQCC (SP), DX + CMOVQCC R13, DX MOVQ DX, 8(DI) - CMOVQCC 8(SP), R8 + CMOVQCC R14, R8 MOVQ R8, 16(DI) - 
CMOVQCC 16(SP), R9 + CMOVQCC R15, R9 MOVQ R9, 24(DI) - CMOVQCC 24(SP), R10 + CMOVQCC (SP), R10 MOVQ R10, 32(DI) - CMOVQCC 32(SP), R11 + CMOVQCC 8(SP), R11 MOVQ R11, 40(DI) - CMOVQCC 40(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 48(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 56(SP), R14 - MOVQ R14, 64(DI) RET -// func addn9(a *[9]uint64, b *[9]uint64) uint64 -TEXT ·addn9(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | + +/* end */ + + RET + +// func addn6(a *[6]uint64, b *[6]uint64) uint64 +TEXT ·addn6(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI // | MOVQ (DI), CX @@ -5398,12 +4604,6 @@ TEXT ·addn9(SB), NOSPLIT, $0-24 ADCQ 32(SI), R10 MOVQ 40(DI), R11 ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 ADCQ $0x00, AX // | @@ -5413,14 +4613,17 @@ TEXT ·addn9(SB), NOSPLIT, $0-24 MOVQ R9, 24(DI) MOVQ R10, 32(DI) MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) MOVQ AX, ret+16(FP) RET -// func double9(c *[9]uint64, a *[9]uint64, p *[9]uint64) -TEXT ·double9(SB), NOSPLIT, $64-24 + // | + +/* end */ + + RET + +// func double6(c *[6]uint64, a *[6]uint64, p *[6]uint64) +TEXT ·double6(SB), NOSPLIT, $16-24 // | MOVQ a+8(FP), DI XORQ AX, AX @@ -5436,68 +4639,50 @@ TEXT ·double9(SB), NOSPLIT, $64-24 ADCQ R10, R10 MOVQ 40(DI), R11 ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 ADCQ $0x00, AX // | MOVQ p+16(FP), SI - MOVQ CX, R15 - SUBQ (SI), R15 - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, (SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 8(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 16(SP) + MOVQ CX, R12 + SUBQ (SI), R12 + MOVQ DX, R13 + SBBQ 8(SI), R13 + MOVQ R8, R14 + SBBQ 16(SI), R14 + MOVQ R9, R15 + SBBQ 24(SI), R15 MOVQ R10, BX SBBQ 32(SI), BX - MOVQ BX, 24(SP) + MOVQ BX, (SP) MOVQ R11, BX SBBQ 40(SI), BX - MOVQ BX, 32(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 40(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 48(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 56(SP) + MOVQ BX, 8(SP) SBBQ $0x00, AX // | MOVQ c+0(FP), DI - CMOVQCC R15, CX + CMOVQCC R12, CX MOVQ CX, (DI) - CMOVQCC (SP), DX + CMOVQCC R13, DX MOVQ DX, 8(DI) - CMOVQCC 8(SP), R8 + CMOVQCC R14, R8 MOVQ R8, 16(DI) - CMOVQCC 16(SP), R9 + CMOVQCC R15, R9 MOVQ R9, 24(DI) - CMOVQCC 24(SP), R10 + CMOVQCC (SP), R10 MOVQ R10, 32(DI) - CMOVQCC 32(SP), R11 + CMOVQCC 8(SP), R11 MOVQ R11, 40(DI) - CMOVQCC 40(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 48(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 56(SP), R14 - MOVQ R14, 64(DI) RET -// func sub9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64) -TEXT ·sub9(SB), NOSPLIT, $64-32 + // | + +/* end */ + + RET + +// func sub6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64) +TEXT ·sub6(SB), NOSPLIT, $16-32 // | MOVQ a+8(FP), DI MOVQ b+16(FP), SI @@ -5514,58 +4699,46 @@ TEXT ·sub9(SB), NOSPLIT, $64-32 SBBQ 32(SI), R10 MOVQ 40(DI), R11 SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 // | MOVQ p+24(FP), SI - MOVQ (SI), R15 + MOVQ (SI), R12 + CMOVQCC AX, R12 + MOVQ 8(SI), R13 + CMOVQCC AX, R13 + MOVQ 16(SI), R14 + CMOVQCC AX, R14 + MOVQ 24(SI), R15 CMOVQCC AX, R15 - CMOVQCS 8(SI), AX - MOVQ AX, (SP) - CMOVQCS 16(SI), AX - MOVQ AX, 8(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 16(SP) CMOVQCS 32(SI), AX - MOVQ AX, 24(SP) + MOVQ AX, (SP) CMOVQCS 40(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 
56(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 56(SP) + MOVQ AX, 8(SP) // | MOVQ c+0(FP), DI - ADDQ R15, CX + ADDQ R12, CX MOVQ CX, (DI) - ADCQ (SP), DX + ADCQ R13, DX MOVQ DX, 8(DI) - ADCQ 8(SP), R8 + ADCQ R14, R8 MOVQ R8, 16(DI) - ADCQ 16(SP), R9 + ADCQ R15, R9 MOVQ R9, 24(DI) - ADCQ 24(SP), R10 + ADCQ (SP), R10 MOVQ R10, 32(DI) - ADCQ 32(SP), R11 + ADCQ 8(SP), R11 MOVQ R11, 40(DI) - ADCQ 40(SP), R12 - MOVQ R12, 48(DI) - ADCQ 48(SP), R13 - MOVQ R13, 56(DI) - ADCQ 56(SP), R14 - MOVQ R14, 64(DI) RET -// func subn9(a *[9]uint64, b *[9]uint64) uint64 -TEXT ·subn9(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func subn6(a *[6]uint64, b *[6]uint64) uint64 +TEXT ·subn6(SB), NOSPLIT, $0-24 // | MOVQ a+0(FP), DI MOVQ b+8(FP), SI @@ -5584,12 +4757,6 @@ TEXT ·subn9(SB), NOSPLIT, $0-24 SBBQ 32(SI), R10 MOVQ 40(DI), R11 SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 ADCQ $0x00, AX // | @@ -5599,14 +4766,17 @@ TEXT ·subn9(SB), NOSPLIT, $0-24 MOVQ R9, 24(DI) MOVQ R10, 32(DI) MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) MOVQ AX, ret+16(FP) RET -// func _neg9(c *[9]uint64, a *[9]uint64, p *[9]uint64) -TEXT ·_neg9(SB), NOSPLIT, $0-24 + // | + +/* end */ + + RET + +// func _neg6(c *[6]uint64, a *[6]uint64, p *[6]uint64) +TEXT ·_neg6(SB), NOSPLIT, $0-24 // | MOVQ a+8(FP), DI @@ -5624,12 +4794,6 @@ TEXT ·_neg9(SB), NOSPLIT, $0-24 SBBQ 32(DI), R10 MOVQ 40(SI), R11 SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 // | MOVQ c+0(FP), DI @@ -5639,13 +4803,16 @@ TEXT ·_neg9(SB), NOSPLIT, $0-24 MOVQ R9, 24(DI) MOVQ R10, 32(DI) MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) RET -// func mul_two_9(a *[9]uint64) -TEXT ·mul_two_9(SB), NOSPLIT, $0-8 + // | + +/* end */ + + RET + +// func mul_two_6(a *[6]uint64) +TEXT ·mul_two_6(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX RCLQ $0x01, (DI) @@ -5654,18 +4821,12 @@ TEXT ·mul_two_9(SB), NOSPLIT, $0-8 RCLQ $0x01, 24(DI) RCLQ $0x01, 32(DI) RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) RET -// func div_two_9(a *[9]uint64) -TEXT ·div_two_9(SB), NOSPLIT, $0-8 +// func div_two_6(a *[6]uint64) +TEXT ·div_two_6(SB), NOSPLIT, $0-8 MOVQ a+0(FP), DI XORQ AX, AX - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) RCRQ $0x01, 40(DI) RCRQ $0x01, 32(DI) RCRQ $0x01, 24(DI) @@ -5674,31084 +4835,70936 @@ TEXT ·div_two_9(SB), NOSPLIT, $0-8 RCRQ $0x01, (DI) RET -// func cpy10(dst *[10]uint64, src *[10]uint64) -TEXT ·cpy10(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - RET +// func mul6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64, inp uint64) +TEXT ·mul6(SB), NOSPLIT, $16-40 + // | -// func eq10(a *[10]uint64, b *[10]uint64) bool -TEXT ·eq10(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), 
R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) +/* inputs */ -ret: - RET + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX -// func cmp10(a *[10]uint64, b *[10]uint64) int8 -TEXT ·cmp10(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | -gt: - MOVB $0x01, ret+16(FP) - JMP ret +/* i = 0 */ -lt: - MOVB $0xff, ret+16(FP) + // | a0 @ DX + MOVQ (DI), DX -ret: - RET + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) -// func add10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64) -TEXT ·add10(SB), NOSPLIT, $80-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - ADCQ $0x00, AX + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, (SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 8(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 16(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 24(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 32(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 40(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 48(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 56(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 64(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 72(SP) - SBBQ $0x00, AX + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 - // | - MOVQ c+0(FP), DI - CMOVQCC (SP), CX - MOVQ CX, (DI) - CMOVQCC 8(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 16(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 24(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 32(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 40(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 48(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 56(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 64(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 72(SP), R15 - MOVQ R15, 72(DI) - RET + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 -// func addn10(a *[10]uint64, b *[10]uint64) uint64 -TEXT ·addn10(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + ADCQ $0x00, R12 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - ADCQ $0x00, AX + // | - // 
| - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ AX, ret+16(FP) - RET +/* i = 1 */ -// func double10(c *[10]uint64, a *[10]uint64, p *[10]uint64) -TEXT ·double10(SB), NOSPLIT, $80-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX + // | a1 @ DX MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - ADCQ $0x00, AX - - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, (SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 8(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 16(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 24(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 32(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 40(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 48(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 56(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 64(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 72(SP) - SBBQ $0x00, AX + XORQ R13, R13 - // | - MOVQ c+0(FP), DI - CMOVQCC (SP), CX - MOVQ CX, (DI) - CMOVQCC 8(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 16(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 24(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 32(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 40(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 48(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 56(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 64(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 72(SP), R15 - MOVQ R15, 72(DI) - RET + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) -// func sub10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64) -TEXT ·sub10(SB), NOSPLIT, $80-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, (SP) - CMOVQCS 8(SI), AX - MOVQ AX, 8(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 16(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 24(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 72(SP) + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | - MOVQ c+0(FP), DI - ADDQ (SP), CX - MOVQ CX, (DI) - ADCQ 8(SP), DX - MOVQ DX, 8(DI) - ADCQ 16(SP), R8 - MOVQ R8, 16(DI) - ADCQ 24(SP), R9 - MOVQ R9, 24(DI) - ADCQ 32(SP), R10 - MOVQ R10, 32(DI) - ADCQ 40(SP), R11 - MOVQ R11, 40(DI) - ADCQ 48(SP), R12 - MOVQ R12, 48(DI) - ADCQ 56(SP), R13 - MOVQ R13, 56(DI) - ADCQ 64(SP), R14 - MOVQ R14, 64(DI) - ADCQ 72(SP), R15 - MOVQ R15, 72(DI) - RET + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 -// func subn10(a *[10]uint64, b *[10]uint64) uint64 -TEXT ·subn10(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 
+ ADCXQ BX, R12 - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - ADCQ $0x00, AX + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ AX, ret+16(FP) - RET + // | -// func _neg10(c *[10]uint64, a *[10]uint64, p *[10]uint64) -TEXT ·_neg10(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI +/* i = 2 */ - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R14, R14 - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - RET + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 -// func mul_two_10(a *[10]uint64) -TEXT ·mul_two_10(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RET + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 -// func div_two_10(a *[10]uint64) -TEXT ·div_two_10(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 -// func cpy11(dst *[11]uint64, src *[11]uint64) -TEXT ·cpy11(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - RET + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 -// func eq11(a *[11]uint64, b *[11]uint64) bool -TEXT ·eq11(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 
- CMPQ 80(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 -ret: - RET + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 -// func cmp11(a *[11]uint64, b *[11]uint64) int8 -TEXT ·cmp11(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | -gt: - MOVB $0x01, ret+16(FP) - JMP ret +/* i = 3 */ -lt: - MOVB $0xff, ret+16(FP) + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R15, R15 -ret: - RET + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 -// func add11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64) -TEXT ·add11(SB), NOSPLIT, $96-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - ADCQ $0x00, AX + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 8(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 16(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 24(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 32(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 40(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 48(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 56(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 64(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 72(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 80(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 88(SP) - SBBQ $0x00, AX + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | - MOVQ c+0(FP), DI - CMOVQCC 8(SP), CX - MOVQ CX, (DI) - CMOVQCC 16(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 24(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 32(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 40(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 48(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 56(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 64(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 72(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 80(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 88(SP), BX - MOVQ BX, 80(DI) - RET + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 -// func addn11(a *[11]uint64, b *[11]uint64) uint64 -TEXT ·addn11(SB), NOSPLIT, $8-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), 
R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - ADCQ $0x00, AX + // | - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ AX, ret+16(FP) - RET +/* i = 4 */ -// func double11(c *[11]uint64, a *[11]uint64, p *[11]uint64) -TEXT ·double11(SB), NOSPLIT, $96-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX - MOVQ BX, (SP) - ADCQ $0x00, AX + // | a4 @ DX + MOVQ 32(DI), DX + XORQ CX, CX - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 8(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 16(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 24(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 32(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 40(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 48(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 56(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 64(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 72(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 80(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 88(SP) - SBBQ $0x00, AX + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | - MOVQ c+0(FP), DI - CMOVQCC 8(SP), CX - MOVQ CX, (DI) - CMOVQCC 16(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 24(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 32(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 40(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 48(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 56(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 64(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 72(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 80(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 88(SP), BX - MOVQ BX, 80(DI) - RET + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 -// func sub11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64) -TEXT ·sub11(SB), NOSPLIT, $96-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 8(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 16(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 24(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 88(SP) + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | - MOVQ c+0(FP), DI - ADDQ 8(SP), CX - MOVQ 
CX, (DI) - ADCQ 16(SP), DX - MOVQ DX, 8(DI) - ADCQ 24(SP), R8 - MOVQ R8, 16(DI) - ADCQ 32(SP), R9 - MOVQ R9, 24(DI) - ADCQ 40(SP), R10 - MOVQ R10, 32(DI) - ADCQ 48(SP), R11 - MOVQ R11, 40(DI) - ADCQ 56(SP), R12 - MOVQ R12, 48(DI) - ADCQ 64(SP), R13 - MOVQ R13, 56(DI) - ADCQ 72(SP), R14 - MOVQ R14, 64(DI) - ADCQ 80(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 88(SP), BX - MOVQ BX, 80(DI) - RET + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 -// func subn11(a *[11]uint64, b *[11]uint64) uint64 -TEXT ·subn11(SB), NOSPLIT, $8-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - ADCQ $0x00, AX + // | - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ AX, ret+16(FP) - RET +/* i = 5 */ -// func _neg11(c *[11]uint64, a *[11]uint64, p *[11]uint64) -TEXT ·_neg11(SB), NOSPLIT, $8-24 - // | - MOVQ a+8(FP), DI + // | a5 @ DX + MOVQ 40(DI), DX + XORQ DI, DI + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 R8 | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 DI - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) MOVQ (SP), BX - MOVQ BX, 80(DI) - RET + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ CX, 8(SP) -// func mul_two_11(a *[11]uint64) -TEXT ·mul_two_11(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RET + // | fetch modulus + MOVQ p+24(FP), CX -// func div_two_11(a *[11]uint64) -TEXT ·div_two_11(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 8(SP) | 11 (SP) + + + // | + +/* montgomery 
reduction */ + + // | clear flags XORQ AX, AX - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | -// func cpy12(dst *[12]uint64, src *[12]uint64) -TEXT ·cpy12(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - MOVQ 88(SI), R8 - MOVQ R8, 88(DI) - RET +/* i = 0 */ -// func eq12(a *[12]uint64, b *[12]uint64) bool -TEXT ·eq12(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JNE ret - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | + // | W + // | 0 BX | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 8(SP) | 11 (SP) -ret: - RET -// func cmp12(a *[12]uint64, b *[12]uint64) int8 -TEXT ·cmp12(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JB gt - JA lt - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, DI -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | -lt: - MOVB $0xff, ret+16(FP) +/* */ -ret: - RET + // | j0 -// func add12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64) -TEXT ·add12(SB), NOSPLIT, $112-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | w0 @ BX + MULXQ (CX), AX, DI + ADOXQ AX, BX + ADCXQ DI, SI - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - ADCQ $0x00, AX + // | j1 - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 16(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 24(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ 
BX, 32(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 40(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 48(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 56(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 64(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 72(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 80(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 88(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 96(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 104(SP) - SBBQ $0x00, AX + // | w1 @ SI + MULXQ 8(CX), AX, DI + ADOXQ AX, SI + ADCXQ DI, R8 - // | - MOVQ c+0(FP), DI - CMOVQCC 16(SP), CX - MOVQ CX, (DI) - CMOVQCC 24(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 32(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 40(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 48(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 56(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 64(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 72(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 80(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 88(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 96(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 104(SP), BX - MOVQ BX, 88(DI) - RET + // | j2 -// func addn12(a *[12]uint64, b *[12]uint64) uint64 -TEXT ·addn12(SB), NOSPLIT, $16-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | w2 @ R8 + MULXQ 16(CX), AX, DI + ADOXQ AX, R8 + ADCXQ DI, R9 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - ADCQ $0x00, AX + // | j3 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ AX, ret+16(FP) - RET + // | w3 @ R9 + MULXQ 24(CX), AX, DI + ADOXQ AX, R9 + ADCXQ DI, R10 -// func double12(c *[12]uint64, a *[12]uint64, p *[12]uint64) -TEXT ·double12(SB), NOSPLIT, $112-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ BX, BX - MOVQ BX, 8(SP) - ADCQ $0x00, AX + // | j4 - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 16(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 24(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 32(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 40(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 48(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 56(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 64(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 72(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 80(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 88(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 96(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 104(SP) - SBBQ $0x00, AX + // | w4 @ R10 + MULXQ 32(CX), AX, DI + ADOXQ AX, R10 + ADCXQ DI, R11 - // | - MOVQ c+0(FP), DI - CMOVQCC 16(SP), CX - MOVQ CX, 
(DI) - CMOVQCC 24(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 32(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 40(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 48(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 56(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 64(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 72(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 80(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 88(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 96(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 104(SP), BX - MOVQ BX, 88(DI) - RET + // | j5 -// func sub12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64) -TEXT ·sub12(SB), NOSPLIT, $112-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | w5 @ R11 + MULXQ 40(CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 + ADOXQ BX, R12 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 16(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 24(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 88(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 96(SP) - CMOVQCS 88(SI), AX - MOVQ AX, 104(SP) + // | - // | - MOVQ c+0(FP), DI - ADDQ 16(SP), CX - MOVQ CX, (DI) - ADCQ 24(SP), DX - MOVQ DX, 8(DI) - ADCQ 32(SP), R8 - MOVQ R8, 16(DI) - ADCQ 40(SP), R9 - MOVQ R9, 24(DI) - ADCQ 48(SP), R10 - MOVQ R10, 32(DI) - ADCQ 56(SP), R11 - MOVQ R11, 40(DI) - ADCQ 64(SP), R12 - MOVQ R12, 48(DI) - ADCQ 72(SP), R13 - MOVQ R13, 56(DI) - ADCQ 80(SP), R14 - MOVQ R14, 64(DI) - ADCQ 88(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 96(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - ADCQ 104(SP), BX - MOVQ BX, 88(DI) - RET +/* i = 1 */ -// func subn12(a *[12]uint64, b *[12]uint64) uint64 -TEXT ·subn12(SB), NOSPLIT, $16-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | + // | W + // | 0 - | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 8(SP) | 11 (SP) - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - ADCQ $0x00, AX - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ AX, ret+16(FP) - RET + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, DI -// func _neg12(c *[12]uint64, a *[12]uint64, p *[12]uint64) -TEXT ·_neg12(SB), NOSPLIT, $16-24 - // | - MOVQ a+8(FP), DI + // | - 
// | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - MOVQ 88(SI), BX - SBBQ 88(DI), BX - MOVQ BX, 8(SP) +/* */ - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - RET + // | j0 -// func mul_two_12(a *[12]uint64) -TEXT ·mul_two_12(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RCLQ $0x01, 88(DI) - RET + // | w1 @ SI + MULXQ (CX), AX, DI + ADOXQ AX, SI + ADCXQ DI, R8 -// func div_two_12(a *[12]uint64) -TEXT ·div_two_12(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI + // | j1 + + // | w2 @ R8 + MULXQ 8(CX), AX, DI + ADOXQ AX, R8 + ADCXQ DI, R9 + + // | j2 + + // | w3 @ R9 + MULXQ 16(CX), AX, DI + ADOXQ AX, R9 + ADCXQ DI, R10 + + // | j3 + + // | w4 @ R10 + MULXQ 24(CX), AX, DI + ADOXQ AX, R10 + ADCXQ DI, R11 + + // | j4 + + // | w5 @ R11 + MULXQ 32(CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 + + // | j5 + + // | w6 @ R12 + MULXQ 40(CX), AX, DI + ADOXQ AX, R12 + ADCXQ DI, R13 + ADOXQ BX, R13 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags XORQ AX, AX - RCRQ $0x01, 88(DI) - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | -// func cpy13(dst *[13]uint64, src *[13]uint64) -TEXT ·cpy13(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - MOVQ 88(SI), R8 - MOVQ R8, 88(DI) - MOVQ 96(SI), R8 - MOVQ R8, 96(DI) - RET +/* i = 2 */ -// func eq13(a *[13]uint64, b *[13]uint64) bool -TEXT ·eq13(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JNE ret - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JNE ret - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | + // | W + // | 0 - | 1 - | 2 R8 | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 
| 10 8(SP) | 11 (SP) -ret: - RET -// func cmp13(a *[13]uint64, b *[13]uint64) int8 -TEXT ·cmp13(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JB gt - JA lt - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JB gt - JA lt - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | | u2 = w2 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, DI -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | -lt: - MOVB $0xff, ret+16(FP) +/* */ -ret: - RET + // | j0 -// func add13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64) -TEXT ·add13(SB), NOSPLIT, $128-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | w2 @ R8 + MULXQ (CX), AX, DI + ADOXQ AX, R8 + ADCXQ DI, R9 + + // | j1 + + // | w3 @ R9 + MULXQ 8(CX), AX, DI + ADOXQ AX, R9 + ADCXQ DI, R10 + + // | j2 + + // | w4 @ R10 + MULXQ 16(CX), AX, DI + ADOXQ AX, R10 + ADCXQ DI, R11 + + // | j3 + + // | w5 @ R11 + MULXQ 24(CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 + + // | j4 + + // | w6 @ R12 + MULXQ 32(CX), AX, DI + ADOXQ AX, R12 + ADCXQ DI, R13 + + // | j5 + + // | w7 @ R13 + MULXQ 40(CX), AX, DI + ADOXQ AX, R13 + ADCXQ DI, R14 + ADOXQ SI, R14 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | clear flags XORQ AX, AX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - ADCQ $0x00, AX + // | - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 24(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 32(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 40(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 48(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 56(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 64(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 72(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 80(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 88(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 96(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 104(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 112(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 120(SP) - SBBQ $0x00, AX +/* i = 3 */ - // | - MOVQ c+0(FP), DI - CMOVQCC 24(SP), CX - MOVQ CX, (DI) - CMOVQCC 32(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 40(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 48(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 56(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 64(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 72(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 80(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 88(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 96(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 104(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 
112(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 96(DI) - RET + // | + // | W + // | 0 - | 1 - | 2 - | 3 R9 | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 8(SP) | 11 (SP) -// func addn13(a *[13]uint64, b *[13]uint64) uint64 -TEXT ·addn13(SB), NOSPLIT, $24-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - ADCQ $0x00, AX + // | | u3 = w3 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, DI - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ AX, ret+16(FP) - RET + // | -// func double13(c *[13]uint64, a *[13]uint64, p *[13]uint64) -TEXT ·double13(SB), NOSPLIT, $128-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ BX, BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ BX, BX - MOVQ BX, 16(SP) - ADCQ $0x00, AX +/* */ - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 24(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 32(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 40(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 48(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 56(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 64(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 72(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 80(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 88(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 96(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 104(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 112(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 120(SP) - SBBQ $0x00, AX + // | j0 - // | - MOVQ c+0(FP), DI - CMOVQCC 24(SP), CX - MOVQ CX, (DI) - CMOVQCC 32(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 40(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 48(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 56(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 64(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 72(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 80(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 88(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 96(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 104(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 112(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 96(DI) - RET + // | w3 @ R9 + MULXQ (CX), AX, DI + ADOXQ AX, R9 + ADCXQ DI, R10 -// func sub13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64) -TEXT ·sub13(SB), NOSPLIT, $128-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 
8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) + // | j1 - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 24(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 32(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 88(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 96(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 104(SP) - CMOVQCS 88(SI), AX - MOVQ AX, 112(SP) - CMOVQCS 96(SI), AX - MOVQ AX, 120(SP) + // | w4 @ R10 + MULXQ 8(CX), AX, DI + ADOXQ AX, R10 + ADCXQ DI, R11 - // | - MOVQ c+0(FP), DI - ADDQ 24(SP), CX - MOVQ CX, (DI) - ADCQ 32(SP), DX - MOVQ DX, 8(DI) - ADCQ 40(SP), R8 - MOVQ R8, 16(DI) - ADCQ 48(SP), R9 - MOVQ R9, 24(DI) - ADCQ 56(SP), R10 - MOVQ R10, 32(DI) - ADCQ 64(SP), R11 - MOVQ R11, 40(DI) - ADCQ 72(SP), R12 - MOVQ R12, 48(DI) - ADCQ 80(SP), R13 - MOVQ R13, 56(DI) - ADCQ 88(SP), R14 - MOVQ R14, 64(DI) - ADCQ 96(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 104(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - ADCQ 112(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - ADCQ 120(SP), BX - MOVQ BX, 96(DI) - RET + // | j2 -// func subn13(a *[13]uint64, b *[13]uint64) uint64 -TEXT ·subn13(SB), NOSPLIT, $24-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | w5 @ R11 + MULXQ 16(CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - ADCQ $0x00, AX + // | j3 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ AX, ret+16(FP) - RET + // | w6 @ R12 + MULXQ 24(CX), AX, DI + ADOXQ AX, R12 + ADCXQ DI, R13 -// func _neg13(c *[13]uint64, a *[13]uint64, p *[13]uint64) -TEXT ·_neg13(SB), NOSPLIT, $24-24 - // | - MOVQ a+8(FP), DI + // | j4 - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - MOVQ 88(SI), BX - SBBQ 88(DI), BX - MOVQ BX, 8(SP) - MOVQ 96(SI), BX - SBBQ 96(DI), BX - MOVQ BX, 16(SP) + // | w7 @ R13 + MULXQ 32(CX), AX, DI + 
ADOXQ AX, R13 + ADCXQ DI, R14 - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - RET + // | j5 -// func mul_two_13(a *[13]uint64) -TEXT ·mul_two_13(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RCLQ $0x01, 88(DI) - RCLQ $0x01, 96(DI) - RET + // | w8 @ R14 + MULXQ 40(CX), AX, DI + ADOXQ AX, R14 + ADCXQ DI, R15 + ADOXQ R8, R15 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 -// func div_two_13(a *[13]uint64) -TEXT ·div_two_13(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI + // | clear flags XORQ AX, AX - RCRQ $0x01, 96(DI) - RCRQ $0x01, 88(DI) - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | -// func cpy14(dst *[14]uint64, src *[14]uint64) -TEXT ·cpy14(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - MOVQ 88(SI), R8 - MOVQ R8, 88(DI) - MOVQ 96(SI), R8 - MOVQ R8, 96(DI) - MOVQ 104(SI), R8 - MOVQ R8, 104(DI) - RET +/* i = 4 */ -// func eq14(a *[14]uint64, b *[14]uint64) bool -TEXT ·eq14(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JNE ret - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JNE ret - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JNE ret - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R10 | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 8(SP) | 11 (SP) -ret: - RET -// func cmp14(a *[14]uint64, b *[14]uint64) int8 -TEXT ·cmp14(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JB gt - JA lt - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JB gt - JA lt - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JB gt - JA lt - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA 
lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | | u4 = w4 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, DI -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | -lt: - MOVB $0xff, ret+16(FP) +/* */ -ret: - RET + // | j0 -// func add14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64) -TEXT ·add14(SB), NOSPLIT, $144-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | w4 @ R10 + MULXQ (CX), AX, DI + ADOXQ AX, R10 + ADCXQ DI, R11 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - ADCQ $0x00, AX + // | j1 - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 32(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 40(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 48(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 56(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 64(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 72(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 80(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 88(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 96(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 104(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 112(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 120(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 128(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 136(SP) - SBBQ $0x00, AX + // | w5 @ R11 + MULXQ 8(CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 - // | - MOVQ c+0(FP), DI - CMOVQCC 32(SP), CX - MOVQ CX, (DI) - CMOVQCC 40(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 48(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 56(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 64(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 72(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 80(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 88(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 96(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 104(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 112(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 104(DI) - RET + // | j2 -// func addn14(a *[14]uint64, b *[14]uint64) uint64 -TEXT ·addn14(SB), NOSPLIT, $32-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | w6 @ R12 + MULXQ 16(CX), AX, DI + ADOXQ AX, R12 + ADCXQ DI, R13 - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - ADCQ $0x00, AX + // | j3 - // | - MOVQ 
CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ AX, ret+16(FP) - RET + // | w7 @ R13 + MULXQ 24(CX), AX, DI + ADOXQ AX, R13 + ADCXQ DI, R14 -// func double14(c *[14]uint64, a *[14]uint64, p *[14]uint64) -TEXT ·double14(SB), NOSPLIT, $144-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ BX, BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ BX, BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ BX, BX - MOVQ BX, 24(SP) - ADCQ $0x00, AX + // | j4 - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 32(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 40(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 48(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 56(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 64(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 72(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 80(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 88(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 96(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 104(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 112(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 120(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 128(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 136(SP) - SBBQ $0x00, AX + // | w8 @ R14 + MULXQ 32(CX), AX, DI + ADOXQ AX, R14 + ADCXQ DI, R15 - // | - MOVQ c+0(FP), DI - CMOVQCC 32(SP), CX - MOVQ CX, (DI) - CMOVQCC 40(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 48(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 56(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 64(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 72(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 80(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 88(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 96(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 104(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 112(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 104(DI) + // | j5 + + // | w9 @ R15 + MULXQ 40(CX), AX, DI + ADOXQ AX, R15 + + // | w10 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), BX + ADCXQ DI, BX + ADOXQ R9, BX + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R11 + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 (SP) + + + // | | u5 = w5 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, DI + + // | + +/* */ + + // | j0 + + // | w5 @ R11 + MULXQ (CX), AX, DI + ADOXQ AX, R11 + ADCXQ DI, R12 + + // | j1 + + // | w6 @ R12 + MULXQ 8(CX), AX, DI + ADOXQ AX, R12 + ADCXQ DI, R13 + + // | j2 + + // | w7 @ R13 + MULXQ 16(CX), AX, DI + ADOXQ AX, R13 + ADCXQ DI, R14 + + // | j3 + + // | w8 @ R14 + MULXQ 24(CX), AX, DI + ADOXQ AX, R14 + ADCXQ DI, R15 + + // | j4 + + // | w9 @ R15 + MULXQ 32(CX), AX, DI + ADOXQ AX, R15 + ADCXQ DI, BX + + // | j5 + + // | w10 @ BX + MULXQ 
40(CX), AX, DI + ADOXQ AX, BX + + // | w11 @ (SP) + // | move to an idle register + MOVQ (SP), SI + ADCXQ DI, SI + ADOXQ R10, SI + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - + // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 SI + + + // | + +/* modular reduction */ + + MOVQ R12, AX + SUBQ (CX), AX + MOVQ R13, DI + SBBQ 8(CX), DI + MOVQ R14, R8 + SBBQ 16(CX), R8 + MOVQ R15, R9 + SBBQ 24(CX), R9 + MOVQ BX, R10 + SBBQ 32(CX), R10 + MOVQ SI, DX + SBBQ 40(CX), DX + MOVQ DX, (SP) + SBBQ $0x00, R11 + + // | + +/* out */ + + MOVQ c+0(FP), R11 + CMOVQCC AX, R12 + MOVQ R12, (R11) + CMOVQCC DI, R13 + MOVQ R13, 8(R11) + CMOVQCC R8, R14 + MOVQ R14, 16(R11) + CMOVQCC R9, R15 + MOVQ R15, 24(R11) + CMOVQCC R10, BX + MOVQ BX, 32(R11) + CMOVQCC (SP), SI + MOVQ SI, 40(R11) RET -// func sub14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64) -TEXT ·sub14(SB), NOSPLIT, $144-32 - // | + // | + +/* end */ + + +// func mul_no_adx_bmi2_6(c *[6]uint64, a *[6]uint64, b *[6]uint64, p *[6]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_6(SB), NOSPLIT, $32-40 + // | + +/* inputs */ + MOVQ a+8(FP), DI MOVQ b+16(FP), SI - XORQ AX, AX + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 32(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 40(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 88(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 96(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 104(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 112(SP) - CMOVQCS 88(SI), AX - MOVQ AX, 120(SP) - CMOVQCS 96(SI), AX - MOVQ AX, 128(SP) - CMOVQCS 104(SI), AX - MOVQ AX, 136(SP) + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 - // | - MOVQ c+0(FP), DI - ADDQ 32(SP), CX - MOVQ CX, (DI) - ADCQ 40(SP), DX - MOVQ DX, 8(DI) - ADCQ 48(SP), R8 - MOVQ R8, 16(DI) - ADCQ 56(SP), R9 - MOVQ R9, 24(DI) - ADCQ 64(SP), R10 - MOVQ R10, 32(DI) - ADCQ 72(SP), R11 - MOVQ R11, 40(DI) - ADCQ 80(SP), R12 - MOVQ R12, 48(DI) - ADCQ 88(SP), R13 - MOVQ R13, 56(DI) - ADCQ 96(SP), R14 - MOVQ R14, 64(DI) - ADCQ 104(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 112(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - ADCQ 120(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - ADCQ 128(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - ADCQ 136(SP), BX - MOVQ BX, 104(DI) - RET + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 -// func subn14(a *[14]uint64, b *[14]uint64) uint64 -TEXT ·subn14(SB), NOSPLIT, $32-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 - // | - MOVQ (DI), CX - SUBQ (SI), CX 
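Note (not part of the patch): the "/* modular reduction */" and "/* out */" sections of the new mul6 above end with the usual conditional subtraction of Montgomery multiplication: the modulus is subtracted once from the 6-limb reduced value with SUBQ/SBBQ, and CMOVQCC keeps the subtracted limbs only when no borrow occurred. The rough pure-Go equivalent below is a sketch under simplifying assumptions (it ignores the extra top carry word that the assembly folds in via R11, i.e. it assumes the reduced value is already below 2*p); the helper name is illustrative.

    package main

    import (
        "fmt"
        "math/bits"
    )

    // reduceOnce6 conditionally subtracts the modulus p from w once,
    // mirroring the SUBQ/SBBQ + CMOVQCC tail of the assembly above.
    // Assumes w < 2*p. Sketch only.
    func reduceOnce6(c, w, p *[6]uint64) {
        var t [6]uint64
        var borrow uint64
        for i := 0; i < 6; i++ {
            t[i], borrow = bits.Sub64(w[i], p[i], borrow)
        }
        // borrow == 0 means w >= p, so keep the subtracted limbs;
        // mask is all ones in that case and all zeros otherwise.
        mask := borrow - 1
        for i := 0; i < 6; i++ {
            c[i] = (t[i] & mask) | (w[i] &^ mask)
        }
    }

    func main() {
        p := [6]uint64{13, 0, 0, 0, 0, 0}
        w := [6]uint64{20, 0, 0, 0, 0, 0}
        var c [6]uint64
        reduceOnce6(&c, &w, &p)
        fmt.Println(c) // [7 0 0 0 0 0]
    }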
- MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - ADCQ $0x00, AX + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ AX, ret+16(FP) - RET + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 -// func _neg14(c *[14]uint64, a *[14]uint64, p *[14]uint64) -TEXT ·_neg14(SB), NOSPLIT, $32-24 - // | - MOVQ a+8(FP), DI + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - MOVQ 88(SI), BX - SBBQ 88(DI), BX - MOVQ BX, 8(SP) - MOVQ 96(SI), BX - SBBQ 96(DI), BX - MOVQ BX, 16(SP) - MOVQ 104(SI), BX - SBBQ 104(DI), BX - MOVQ BX, 24(SP) + // | - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - RET +/* i = 1 */ -// func mul_two_14(a *[14]uint64) -TEXT ·mul_two_14(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RCLQ $0x01, 88(DI) - RCLQ $0x01, 96(DI) - RCLQ $0x01, 104(DI) - RET + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX -// func div_two_14(a *[14]uint64) -TEXT ·div_two_14(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 104(DI) - RCRQ $0x01, 96(DI) - RCRQ $0x01, 88(DI) - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 -// func cpy15(dst *[15]uint64, src *[15]uint64) -TEXT ·cpy15(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 
40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - MOVQ 88(SI), R8 - MOVQ R8, 88(DI) - MOVQ 96(SI), R8 - MOVQ R8, 96(DI) - MOVQ 104(SI), R8 - MOVQ R8, 104(DI) - MOVQ 112(SI), R8 - MOVQ R8, 112(DI) - RET + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func eq15(a *[15]uint64, b *[15]uint64) bool -TEXT ·eq15(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JNE ret - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JNE ret - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JNE ret - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JNE ret - MOVQ 112(DI), R8 - CMPQ 112(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX -ret: - RET + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func cmp15(a *[15]uint64, b *[15]uint64) int8 -TEXT ·cmp15(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ 112(DI), R8 - CMPQ 112(SI), R8 - JB gt - JA lt - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JB gt - JA lt - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JB gt - JA lt - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JB gt - JA lt - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 -lt: - MOVB $0xff, ret+16(FP) + // | -ret: - RET +/* i = 2 */ -// func add15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64) -TEXT ·add15(SB), NOSPLIT, $160-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 
104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ 112(SI), BX - MOVQ BX, 32(SP) - ADCQ $0x00, AX + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 40(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 48(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 56(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 64(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 72(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 80(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 88(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 96(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 104(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 112(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 120(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 128(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 136(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 144(SP) - MOVQ 32(SP), BX - SBBQ 112(SI), BX - MOVQ BX, 152(SP) - SBBQ $0x00, AX + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ c+0(FP), DI - CMOVQCC 40(SP), CX - MOVQ CX, (DI) - CMOVQCC 48(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 56(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 64(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 72(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 80(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 88(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 96(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 104(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 112(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - CMOVQCC 144(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - CMOVQCC 152(SP), BX - MOVQ BX, 112(DI) - RET + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func addn15(a *[15]uint64, b *[15]uint64) uint64 -TEXT ·addn15(SB), NOSPLIT, $40-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ 112(SI), BX - MOVQ BX, 32(SP) - ADCQ $0x00, AX + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - MOVQ AX, ret+16(FP) - RET + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 -// func double15(c 
*[15]uint64, a *[15]uint64, p *[15]uint64) -TEXT ·double15(SB), NOSPLIT, $160-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ BX, BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ BX, BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ BX, BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ BX, BX - MOVQ BX, 32(SP) - ADCQ $0x00, AX + // | - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 40(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 48(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 56(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 64(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 72(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 80(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 88(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 96(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 104(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 112(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 120(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 128(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 136(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 144(SP) - MOVQ 32(SP), BX - SBBQ 112(SI), BX - MOVQ BX, 152(SP) - SBBQ $0x00, AX +/* i = 3 */ - // | - MOVQ c+0(FP), DI - CMOVQCC 40(SP), CX - MOVQ CX, (DI) - CMOVQCC 48(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 56(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 64(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 72(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 80(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 88(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 96(SP), R13 - MOVQ R13, 56(DI) - CMOVQCC 104(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 112(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - CMOVQCC 144(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - CMOVQCC 152(SP), BX - MOVQ BX, 112(DI) - RET + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX -// func sub15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64) -TEXT ·sub15(SB), NOSPLIT, $160-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - SBBQ 112(SI), BX - MOVQ BX, 32(SP) + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 40(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 48(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 88(SP) - 
CMOVQCS 56(SI), AX - MOVQ AX, 96(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 104(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 112(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 120(SP) - CMOVQCS 88(SI), AX - MOVQ AX, 128(SP) - CMOVQCS 96(SI), AX - MOVQ AX, 136(SP) - CMOVQCS 104(SI), AX - MOVQ AX, 144(SP) - CMOVQCS 112(SI), AX - MOVQ AX, 152(SP) + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ c+0(FP), DI - ADDQ 40(SP), CX - MOVQ CX, (DI) - ADCQ 48(SP), DX - MOVQ DX, 8(DI) - ADCQ 56(SP), R8 - MOVQ R8, 16(DI) - ADCQ 64(SP), R9 - MOVQ R9, 24(DI) - ADCQ 72(SP), R10 - MOVQ R10, 32(DI) - ADCQ 80(SP), R11 - MOVQ R11, 40(DI) - ADCQ 88(SP), R12 - MOVQ R12, 48(DI) - ADCQ 96(SP), R13 - MOVQ R13, 56(DI) - ADCQ 104(SP), R14 - MOVQ R14, 64(DI) - ADCQ 112(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 120(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - ADCQ 128(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - ADCQ 136(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - ADCQ 144(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - ADCQ 152(SP), BX - MOVQ BX, 112(DI) - RET + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func subn15(a *[15]uint64, b *[15]uint64) uint64 -TEXT ·subn15(SB), NOSPLIT, $40-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - SBBQ 112(SI), BX - MOVQ BX, 32(SP) - ADCQ $0x00, AX + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - MOVQ AX, ret+16(FP) - RET + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 -// func _neg15(c *[15]uint64, a *[15]uint64, p *[15]uint64) -TEXT ·_neg15(SB), NOSPLIT, $40-24 - // | - MOVQ a+8(FP), DI + // | - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - MOVQ 88(SI), BX - SBBQ 88(DI), BX - MOVQ BX, 8(SP) - MOVQ 96(SI), BX - SBBQ 96(DI), BX - MOVQ BX, 16(SP) - MOVQ 104(SI), BX - SBBQ 104(DI), BX - MOVQ BX, 24(SP) - MOVQ 112(SI), BX - SBBQ 112(DI), BX - MOVQ BX, 32(SP) +/* i = 4 */ - // | - MOVQ c+0(FP), 
DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - RET + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX -// func mul_two_15(a *[15]uint64) -TEXT ·mul_two_15(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RCLQ $0x01, 88(DI) - RCLQ $0x01, 96(DI) - RCLQ $0x01, 104(DI) - RCLQ $0x01, 112(DI) - RET + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX -// func div_two_15(a *[15]uint64) -TEXT ·div_two_15(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 112(DI) - RCRQ $0x01, 104(DI) - RCRQ $0x01, 96(DI) - RCRQ $0x01, 88(DI) - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ $0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func cpy16(dst *[16]uint64, src *[16]uint64) -TEXT ·cpy16(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - MOVQ 8(SI), R8 - MOVQ R8, 8(DI) - MOVQ 16(SI), R8 - MOVQ R8, 16(DI) - MOVQ 24(SI), R8 - MOVQ R8, 24(DI) - MOVQ 32(SI), R8 - MOVQ R8, 32(DI) - MOVQ 40(SI), R8 - MOVQ R8, 40(DI) - MOVQ 48(SI), R8 - MOVQ R8, 48(DI) - MOVQ 56(SI), R8 - MOVQ R8, 56(DI) - MOVQ 64(SI), R8 - MOVQ R8, 64(DI) - MOVQ 72(SI), R8 - MOVQ R8, 72(DI) - MOVQ 80(SI), R8 - MOVQ R8, 80(DI) - MOVQ 88(SI), R8 - MOVQ R8, 88(DI) - MOVQ 96(SI), R8 - MOVQ R8, 96(DI) - MOVQ 104(SI), R8 - MOVQ R8, 104(DI) - MOVQ 112(SI), R8 - MOVQ R8, 112(DI) - MOVQ 120(SI), R8 - MOVQ R8, 120(DI) - RET + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func eq16(a *[16]uint64, b *[16]uint64) bool -TEXT ·eq16(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JNE ret - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JNE ret - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JNE ret - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JNE ret - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JNE ret - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JNE ret - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JNE ret - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JNE ret - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JNE ret - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JNE ret - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JNE ret - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JNE ret - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JNE ret - MOVQ 112(DI), R8 - CMPQ 112(SI), R8 - JNE ret - MOVQ 120(DI), R8 - CMPQ 120(SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX -ret: - RET + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 -// func cmp16(a *[16]uint64, b *[16]uint64) int8 -TEXT ·cmp16(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - 
MOVQ b+8(FP), SI - MOVQ 120(DI), R8 - CMPQ 120(SI), R8 - JB gt - JA lt - MOVQ 112(DI), R8 - CMPQ 112(SI), R8 - JB gt - JA lt - MOVQ 104(DI), R8 - CMPQ 104(SI), R8 - JB gt - JA lt - MOVQ 96(DI), R8 - CMPQ 96(SI), R8 - JB gt - JA lt - MOVQ 88(DI), R8 - CMPQ 88(SI), R8 - JB gt - JA lt - MOVQ 80(DI), R8 - CMPQ 80(SI), R8 - JB gt - JA lt - MOVQ 72(DI), R8 - CMPQ 72(SI), R8 - JB gt - JA lt - MOVQ 64(DI), R8 - CMPQ 64(SI), R8 - JB gt - JA lt - MOVQ 56(DI), R8 - CMPQ 56(SI), R8 - JB gt - JA lt - MOVQ 48(DI), R8 - CMPQ 48(SI), R8 - JB gt - JA lt - MOVQ 40(DI), R8 - CMPQ 40(SI), R8 - JB gt - JA lt - MOVQ 32(DI), R8 - CMPQ 32(SI), R8 - JB gt - JA lt - MOVQ 24(DI), R8 - CMPQ 24(SI), R8 - JB gt - JA lt - MOVQ 16(DI), R8 - CMPQ 16(SI), R8 - JB gt - JA lt - MOVQ 8(DI), R8 - CMPQ 8(SI), R8 - JB gt - JA lt - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 -gt: - MOVB $0x01, ret+16(FP) - JMP ret + // | -lt: - MOVB $0xff, ret+16(FP) +/* i = 5 */ -ret: - RET + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX -// func add16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64) -TEXT ·add16(SB), NOSPLIT, $176-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ 112(SI), BX - MOVQ BX, 32(SP) - MOVQ 120(DI), BX - ADCQ 120(SI), BX - MOVQ BX, 40(SP) - ADCQ $0x00, AX + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ p+24(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 48(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 56(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 64(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 72(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 80(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 88(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 96(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 104(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 112(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - MOVQ BX, 120(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 128(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 136(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 144(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 152(SP) - MOVQ 32(SP), BX - SBBQ 112(SI), BX - MOVQ BX, 160(SP) - MOVQ 40(SP), BX - SBBQ 120(SI), BX - MOVQ BX, 168(SP) - SBBQ $0x00, AX + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ c+0(FP), DI - CMOVQCC 48(SP), CX - MOVQ CX, (DI) - CMOVQCC 56(SP), DX - MOVQ DX, 8(DI) - CMOVQCC 64(SP), R8 - MOVQ R8, 16(DI) - CMOVQCC 72(SP), R9 - MOVQ R9, 24(DI) - CMOVQCC 80(SP), R10 - MOVQ R10, 32(DI) - CMOVQCC 88(SP), R11 - MOVQ R11, 40(DI) - CMOVQCC 96(SP), R12 - MOVQ R12, 48(DI) - CMOVQCC 104(SP), R13 - MOVQ 
R13, 56(DI) - CMOVQCC 112(SP), R14 - MOVQ R14, 64(DI) - CMOVQCC 120(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - CMOVQCC 144(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - CMOVQCC 152(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - CMOVQCC 160(SP), BX - MOVQ BX, 112(DI) - MOVQ 40(SP), BX - CMOVQCC 168(SP), BX - MOVQ BX, 120(DI) - RET + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX -// func addn16(a *[16]uint64, b *[16]uint64) uint64 -TEXT ·addn16(SB), NOSPLIT, $48-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, BX - // | - MOVQ (DI), CX - ADDQ (SI), CX - MOVQ 8(DI), DX - ADCQ 8(SI), DX - MOVQ 16(DI), R8 - ADCQ 16(SI), R8 - MOVQ 24(DI), R9 - ADCQ 24(SI), R9 - MOVQ 32(DI), R10 - ADCQ 32(SI), R10 - MOVQ 40(DI), R11 - ADCQ 40(SI), R11 - MOVQ 48(DI), R12 - ADCQ 48(SI), R12 - MOVQ 56(DI), R13 - ADCQ 56(SI), R13 - MOVQ 64(DI), R14 - ADCQ 64(SI), R14 - MOVQ 72(DI), R15 - ADCQ 72(SI), R15 - MOVQ 80(DI), BX - ADCQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ 112(SI), BX - MOVQ BX, 32(SP) - MOVQ 120(DI), BX - ADCQ 120(SI), BX - MOVQ BX, 40(SP) - ADCQ $0x00, AX + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, BX - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - MOVQ 40(SP), BX - MOVQ BX, 120(DI) - MOVQ AX, ret+16(FP) - RET + // | -// func double16(c *[16]uint64, a *[16]uint64, p *[16]uint64) -TEXT ·double16(SB), NOSPLIT, $176-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - MOVQ 8(DI), DX - ADCQ DX, DX - MOVQ 16(DI), R8 - ADCQ R8, R8 - MOVQ 24(DI), R9 - ADCQ R9, R9 - MOVQ 32(DI), R10 - ADCQ R10, R10 - MOVQ 40(DI), R11 - ADCQ R11, R11 - MOVQ 48(DI), R12 - ADCQ R12, R12 - MOVQ 56(DI), R13 - ADCQ R13, R13 - MOVQ 64(DI), R14 - ADCQ R14, R14 - MOVQ 72(DI), R15 - ADCQ R15, R15 - MOVQ 80(DI), BX - ADCQ BX, BX +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 R10 | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 R9 | 11 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI MOVQ BX, (SP) - MOVQ 88(DI), BX - ADCQ BX, BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - ADCQ BX, BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - ADCQ BX, BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - ADCQ BX, BX - MOVQ BX, 32(SP) - MOVQ 120(DI), BX - ADCQ BX, BX - MOVQ BX, 40(SP) - ADCQ $0x00, AX + MOVQ R9, 8(SP) + MOVQ R8, 16(SP) - // | - MOVQ p+16(FP), SI - MOVQ CX, BX - SUBQ (SI), BX - MOVQ BX, 48(SP) - MOVQ DX, BX - SBBQ 8(SI), BX - MOVQ BX, 56(SP) - MOVQ R8, BX - SBBQ 16(SI), BX - MOVQ BX, 64(SP) - MOVQ R9, BX - SBBQ 24(SI), BX - MOVQ BX, 72(SP) - MOVQ R10, BX - SBBQ 32(SI), BX - MOVQ BX, 80(SP) - MOVQ R11, BX - SBBQ 40(SI), BX - MOVQ BX, 88(SP) - MOVQ R12, BX - SBBQ 48(SI), BX - MOVQ BX, 96(SP) - MOVQ R13, BX - SBBQ 56(SI), BX - MOVQ BX, 104(SP) - MOVQ R14, BX - SBBQ 64(SI), BX - MOVQ BX, 112(SP) - MOVQ R15, BX - SBBQ 72(SI), BX - 
MOVQ BX, 120(SP) - MOVQ (SP), BX - SBBQ 80(SI), BX - MOVQ BX, 128(SP) - MOVQ 8(SP), BX - SBBQ 88(SI), BX - MOVQ BX, 136(SP) - MOVQ 16(SP), BX - SBBQ 96(SI), BX - MOVQ BX, 144(SP) - MOVQ 24(SP), BX - SBBQ 104(SI), BX - MOVQ BX, 152(SP) - MOVQ 32(SP), BX - SBBQ 112(SI), BX - MOVQ BX, 160(SP) - MOVQ 40(SP), BX - SBBQ 120(SI), BX - MOVQ BX, 168(SP) - SBBQ $0x00, AX + // | fetch modulus + MOVQ p+24(FP), R9 - // | - MOVQ c+0(FP), DI - CMOVQCC 48(SP), CX + // | + +/* montgomery reduction */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 R10 | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 16(SP) | 10 8(SP) | 11 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w0 @ CX + MOVQ (R9), AX + MULQ R8 + ADDQ AX, CX + ADCQ DX, BX + + // | j1 + + // | w1 @ DI + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ BX, DI + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w2 @ SI + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ BX, SI + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w3 @ R10 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w4 @ R11 + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w5 @ R12 + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + + // | w6 @ R13 + ADCQ DX, R13 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 R10 | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 16(SP) | 10 8(SP) | 11 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w1 @ DI + MOVQ (R9), AX + MULQ R8 + ADDQ AX, DI + ADCQ DX, BX + + // | j1 + + // | w2 @ SI + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ BX, SI + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w3 @ R10 + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w4 @ R11 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w5 @ R12 + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w6 @ R13 + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ BX, R13 + + // | w7 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 R10 | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 16(SP) | 10 8(SP) | 11 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w2 @ SI + MOVQ (R9), AX + MULQ R8 + ADDQ AX, SI + ADCQ DX, BX + + // | j1 + + // | w3 @ R10 + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ BX, R10 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w4 @ R11 + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w5 @ R12 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w6 @ R13 + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w7 @ R14 + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, R14 + ADCQ DX, CX + 
ADDQ BX, R14 + + // | w8 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R10 | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 16(SP) | 10 8(SP) | 11 (SP) + + + // | | u3 = w3 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w3 @ R10 + MOVQ (R9), AX + MULQ R8 + ADDQ AX, R10 + ADCQ DX, BX + + // | j1 + + // | w4 @ R11 + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ BX, R11 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w5 @ R12 + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w6 @ R13 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w7 @ R14 + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ BX, R14 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w8 @ R15 + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ BX, R15 + + // | move to idle register + MOVQ 16(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 DI | 10 8(SP) | 11 (SP) + + + // | | u4 = w4 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w4 @ R11 + MOVQ (R9), AX + MULQ R8 + ADDQ AX, R11 + ADCQ DX, BX + + // | j1 + + // | w5 @ R12 + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ BX, R12 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w6 @ R13 + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w7 @ R14 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ BX, R14 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w8 @ R15 + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ BX, R15 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w9 @ DI + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, DI + ADCQ DX, CX + ADDQ BX, DI + + // | move to idle register + MOVQ 8(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 + // | 6 R13 | 7 R14 | 8 R15 | 9 DI | 10 SI | 11 (SP) + + + // | | u5 = w5 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R8 + MOVQ $0x00, BX + + // | + +/* */ + + // | j0 + + // | w5 @ R12 + MOVQ (R9), AX + MULQ R8 + ADDQ AX, R12 + ADCQ DX, BX + + // | j1 + + // | w6 @ R13 + MOVQ 8(R9), AX + MULQ R8 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ BX, R13 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j2 + + // | w7 @ R14 + MOVQ 16(R9), AX + MULQ R8 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ BX, R14 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j3 + + // | w8 @ R15 + MOVQ 24(R9), AX + MULQ R8 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ BX, R15 + MOVQ $0x00, BX + ADCQ DX, BX + + // | j4 + + // | w9 @ DI + MOVQ 32(R9), AX + MULQ R8 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ BX, DI + MOVQ $0x00, BX + ADCQ DX, BX + + // | j5 + + // | w10 @ SI + MOVQ 40(R9), AX + MULQ R8 + ADDQ AX, SI + ADCQ DX, CX + ADDQ BX, SI + + // | move to idle register + MOVQ (SP), R10 + + // | w-1 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - + // | 6 R13 | 7 R14 | 8 R15 | 9 DI | 10 SI | 11 R10 + + + // | + +/* modular reduction */ + + MOVQ 
R13, R11 + SUBQ (R9), R11 + MOVQ R14, R12 + SBBQ 8(R9), R12 + MOVQ R15, DX + SBBQ 16(R9), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 24(R9), DX + MOVQ DX, 8(SP) + MOVQ SI, DX + SBBQ 32(R9), DX + MOVQ DX, 16(SP) + MOVQ R10, DX + SBBQ 40(R9), DX + MOVQ DX, 24(SP) + SBBQ $0x00, CX + + // | + +/* out */ + + MOVQ c+0(FP), CX + CMOVQCC R11, R13 + MOVQ R13, (CX) + CMOVQCC R12, R14 + MOVQ R14, 8(CX) + CMOVQCC (SP), R15 + MOVQ R15, 16(CX) + CMOVQCC 8(SP), DI + MOVQ DI, 24(CX) + CMOVQCC 16(SP), SI + MOVQ SI, 32(CX) + CMOVQCC 24(SP), R10 + MOVQ R10, 40(CX) + RET + + // | + +/* end */ + + +// func cpy7(dst *[7]uint64, src *[7]uint64) +TEXT ·cpy7(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + RET + +// func eq7(a *[7]uint64, b *[7]uint64) bool +TEXT ·eq7(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp7(a *[7]uint64, b *[7]uint64) int8 +TEXT ·cmp7(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64) +TEXT ·add7(SB), NOSPLIT, $32-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, R13 + SUBQ (SI), R13 + MOVQ DX, R14 + SBBQ 8(SI), R14 + MOVQ R8, R15 + SBBQ 16(SI), R15 + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, (SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 8(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 16(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 24(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R13, CX + MOVQ CX, (DI) + CMOVQCC R14, DX + MOVQ DX, 8(DI) + CMOVQCC R15, R8 + MOVQ R8, 16(DI) + CMOVQCC (SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 8(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 16(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 24(SP), R12 + MOVQ R12, 48(DI) + RET + + // | + +/* end */ + + RET + +// func addn7(a *[7]uint64, b *[7]uint64) uint64 +TEXT ·addn7(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), 
R12 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double7(c *[7]uint64, a *[7]uint64, p *[7]uint64) +TEXT ·double7(SB), NOSPLIT, $32-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, R13 + SUBQ (SI), R13 + MOVQ DX, R14 + SBBQ 8(SI), R14 + MOVQ R8, R15 + SBBQ 16(SI), R15 + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, (SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 8(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 16(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 24(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R13, CX + MOVQ CX, (DI) + CMOVQCC R14, DX + MOVQ DX, 8(DI) + CMOVQCC R15, R8 + MOVQ R8, 16(DI) + CMOVQCC (SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 8(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 16(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 24(SP), R12 + MOVQ R12, 48(DI) + RET + + // | + +/* end */ + + RET + +// func sub7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64) +TEXT ·sub7(SB), NOSPLIT, $32-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + + // | + MOVQ p+24(FP), SI + MOVQ (SI), R13 + CMOVQCC AX, R13 + MOVQ 8(SI), R14 + CMOVQCC AX, R14 + MOVQ 16(SI), R15 + CMOVQCC AX, R15 + CMOVQCS 24(SI), AX + MOVQ AX, (SP) + CMOVQCS 32(SI), AX + MOVQ AX, 8(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 16(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 24(SP) + + // | + MOVQ c+0(FP), DI + ADDQ R13, CX + MOVQ CX, (DI) + ADCQ R14, DX + MOVQ DX, 8(DI) + ADCQ R15, R8 + MOVQ R8, 16(DI) + ADCQ (SP), R9 + MOVQ R9, 24(DI) + ADCQ 8(SP), R10 + MOVQ R10, 32(DI) + ADCQ 16(SP), R11 + MOVQ R11, 40(DI) + ADCQ 24(SP), R12 + MOVQ R12, 48(DI) + RET + + // | + +/* end */ + + RET + +// func subn7(a *[7]uint64, b *[7]uint64) uint64 +TEXT ·subn7(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg7(c *[7]uint64, a *[7]uint64, p *[7]uint64) +TEXT ·_neg7(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_7(a *[7]uint64) +TEXT ·mul_two_7(SB), NOSPLIT, 
$0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RET + +// func div_two_7(a *[7]uint64) +TEXT ·div_two_7(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64, inp uint64) +TEXT ·mul7(SB), NOSPLIT, $32-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + ADCQ $0x00, R13 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R14, R14 + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R15, R15 + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ CX, CX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R8, R8 + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 
40(DI), DX + XORQ R9, R9 + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ DI, DI + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 R8 | 12 R9 | 13 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R9, 8(SP) + MOVQ 24(SP), R9 + MOVQ R8, 16(SP) + MOVQ CX, 24(SP) + + // | fetch modulus + MOVQ p+24(FP), CX + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R9 | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | + +/* montgomery reduction */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R9 | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R9 + + // | j3 + + // | w3 @ R9 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R10 + + // | j4 + + // | w4 @ R10 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R11 + + // | j5 + + // | w5 @ R11 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j6 + + // | w6 @ R12 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + ADOXQ BX, R13 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R9 | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R9 + + // | j2 + + // | w3 @ R9 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R10 + + // | j3 + + // | w4 @ R10 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R11 + + // | j4 + + // | w5 @ R11 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j5 + + // | w6 @ R12 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j6 + + // | w7 @ R13 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + ADOXQ BX, R14 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI 
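+
+	// | the remaining rounds repeat the same pattern: u = w_i * inp
+	// | (low 64 bits), then u * p is folded into the live window on the
+	// | two ADOX/ADCX carry chains so that w_i clears, and a freed
+	// | register picks up the next stack-resident word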
+ + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R9 | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R9 + + // | j1 + + // | w3 @ R9 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R10 + + // | j2 + + // | w4 @ R10 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R11 + + // | j3 + + // | w5 @ R11 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j4 + + // | w6 @ R12 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j5 + + // | w7 @ R13 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j6 + + // | w8 @ R14 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + ADOXQ SI, R15 + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R9 | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u3 = w3 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w3 @ R9 + MULXQ (CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R10 + + // | j1 + + // | w4 @ R10 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R11 + + // | j2 + + // | w5 @ R11 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j3 + + // | w6 @ R12 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j4 + + // | w7 @ R13 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j5 + + // | w8 @ R14 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j6 + + // | w9 @ R15 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R15 + + // | w10 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), BX + ADCXQ R8, BX + ADOXQ DI, BX + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R10 | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u4 = w4 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w4 @ R10 + MULXQ (CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R11 + + // | j1 + + // | w5 @ R11 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j2 + + // | w6 @ R12 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j3 + + // | w7 @ R13 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j4 + + // | w8 @ R14 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j5 + + // | w9 @ R15 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(CX), AX, R8 + ADOXQ AX, BX + + // | w11 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), DI + ADCXQ R8, DI + ADOXQ R9, DI + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R11 | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 DI | 12 8(SP) | 13 (SP) + + + // | | u5 = w5 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w5 @ R11 + MULXQ (CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R12 + + // | j1 + + // | w6 @ R12 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j2 + + // | w7 @ R13 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j3 + + // | w8 @ R14 + MULXQ 24(CX), AX, R8 
+ ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j4 + + // | w9 @ R15 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, DI + + // | j6 + + // | w11 @ DI + MULXQ 48(CX), AX, R8 + ADOXQ AX, DI + + // | w12 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), SI + ADCXQ R8, SI + ADOXQ R10, SI + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R12 + // | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 DI | 12 SI | 13 (SP) + + + // | | u6 = w6 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w6 @ R12 + MULXQ (CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j1 + + // | w7 @ R13 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j2 + + // | w8 @ R14 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j3 + + // | w9 @ R15 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, DI + + // | j5 + + // | w11 @ DI + MULXQ 40(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, SI + + // | j6 + + // | w12 @ SI + MULXQ 48(CX), AX, R8 + ADOXQ AX, SI + + // | w13 @ (SP) + // | move to an idle register + MOVQ (SP), R9 + ADCXQ R8, R9 + ADOXQ R11, R9 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - + // | 7 R13 | 8 R14 | 9 R15 | 10 BX | 11 DI | 12 SI | 13 R9 + + + // | + +/* modular reduction */ + + MOVQ R13, AX + SUBQ (CX), AX + MOVQ R14, R8 + SBBQ 8(CX), R8 + MOVQ R15, R10 + SBBQ 16(CX), R10 + MOVQ BX, R11 + SBBQ 24(CX), R11 + MOVQ DI, DX + SBBQ 32(CX), DX + MOVQ DX, (SP) + MOVQ SI, DX + SBBQ 40(CX), DX + MOVQ DX, 8(SP) + MOVQ R9, DX + SBBQ 48(CX), DX + MOVQ DX, 16(SP) + SBBQ $0x00, R12 + + // | + +/* out */ + + MOVQ c+0(FP), R12 + CMOVQCC AX, R13 + MOVQ R13, (R12) + CMOVQCC R8, R14 + MOVQ R14, 8(R12) + CMOVQCC R10, R15 + MOVQ R15, 16(R12) + CMOVQCC R11, BX + MOVQ BX, 24(R12) + CMOVQCC (SP), DI + MOVQ DI, 32(R12) + CMOVQCC 8(SP), SI + MOVQ SI, 40(R12) + CMOVQCC 16(SP), R9 + MOVQ R9, 48(R12) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_7(c *[7]uint64, a *[7]uint64, b *[7]uint64, p *[7]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_7(SB), NOSPLIT, $48-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ 
DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 
24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 R8 | 10 R9 | 11 R10 | 12 R11 | 13 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R11, 8(SP) + MOVQ 32(SP), R11 + MOVQ R10, 16(SP) + MOVQ R9, 24(SP) + MOVQ R8, 32(SP) + + // | fetch modulus + MOVQ p+24(FP), R10 + + // | + +/* montgomery reduction */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 32(SP) | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w0 @ CX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R11 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R12 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R13 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + + // | w7 @ R14 + ADCQ DX, R14 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 32(SP) | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w1 @ DI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R11 + MOVQ 24(R10), AX + MULQ R9 + 
ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R12 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R13 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R14 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R8, R14 + + // | w8 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R11 | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 32(SP) | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w2 @ SI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w3 @ BX + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w4 @ R11 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w5 @ R12 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w6 @ R13 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w7 @ R14 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w8 @ R15 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R8, R15 + + // | move to idle register + MOVQ 32(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R11 | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 DI | 10 24(SP) | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w3 @ BX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w4 @ R11 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w5 @ R12 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w6 @ R13 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w7 @ R14 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w8 @ R15 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to idle register + MOVQ 24(SP), BX + + // | w10 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 DI | 10 BX | 11 16(SP) | 12 8(SP) | 13 (SP) + + + // | | u4 = w4 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w4 @ R11 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, R8 + + // | j1 + + // | w5 @ R12 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + 
// | w6 @ R13 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w7 @ R14 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w8 @ R15 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ DI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ BX + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to idle register + MOVQ 16(SP), SI + + // | w11 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 | 6 R13 + // | 7 R14 | 8 R15 | 9 DI | 10 BX | 11 SI | 12 8(SP) | 13 (SP) + + + // | | u5 = w5 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w5 @ R12 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ DX, R8 + + // | j1 + + // | w6 @ R13 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w7 @ R14 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w8 @ R15 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ BX + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ SI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | move to idle register + MOVQ 8(SP), R11 + + // | w12 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R13 + // | 7 R14 | 8 R15 | 9 DI | 10 BX | 11 SI | 12 R11 | 13 (SP) + + + // | | u6 = w6 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | j0 + + // | w6 @ R13 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R8 + + // | j1 + + // | w7 @ R14 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R15 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ BX + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ SI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R11 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | move to idle register + MOVQ (SP), R12 + + // | w-1 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - + // | 7 R14 | 8 R15 | 9 DI | 10 BX | 11 SI | 12 R11 | 13 R12 + + + // | + +/* modular reduction */ + + MOVQ R14, R13 + SUBQ (R10), R13 + MOVQ R15, DX + SBBQ 8(R10), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 16(R10), DX + 
MOVQ DX, 8(SP) + MOVQ BX, DX + SBBQ 24(R10), DX + MOVQ DX, 16(SP) + MOVQ SI, DX + SBBQ 32(R10), DX + MOVQ DX, 24(SP) + MOVQ R11, DX + SBBQ 40(R10), DX + MOVQ DX, 32(SP) + MOVQ R12, DX + SBBQ 48(R10), DX + MOVQ DX, 40(SP) + SBBQ $0x00, CX + + // | + +/* out */ + + MOVQ c+0(FP), CX + CMOVQCC R13, R14 + MOVQ R14, (CX) + CMOVQCC (SP), R15 + MOVQ R15, 8(CX) + CMOVQCC 8(SP), DI + MOVQ DI, 16(CX) + CMOVQCC 16(SP), BX + MOVQ BX, 24(CX) + CMOVQCC 24(SP), SI + MOVQ SI, 32(CX) + CMOVQCC 32(SP), R11 + MOVQ R11, 40(CX) + CMOVQCC 40(SP), R12 + MOVQ R12, 48(CX) + RET + + // | + +/* end */ + + +// func cpy8(dst *[8]uint64, src *[8]uint64) +TEXT ·cpy8(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + RET + +// func eq8(a *[8]uint64, b *[8]uint64) bool +TEXT ·eq8(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp8(a *[8]uint64, b *[8]uint64) int8 +TEXT ·cmp8(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64) +TEXT ·add8(SB), NOSPLIT, $48-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, R14 + SUBQ (SI), R14 + MOVQ DX, R15 + SBBQ 8(SI), R15 + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, (SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 8(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 16(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 24(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 32(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 40(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R14, CX + MOVQ CX, (DI) + CMOVQCC R15, DX + MOVQ DX, 8(DI) + CMOVQCC (SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 8(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 16(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 24(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 32(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 40(SP), R13 + MOVQ R13, 56(DI) + RET + + // | + +/* end */ + + RET + +// func addn8(a *[8]uint64, b *[8]uint64) uint64 +TEXT ·addn8(SB), NOSPLIT, $0-24 + // | + MOVQ 
a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double8(c *[8]uint64, a *[8]uint64, p *[8]uint64) +TEXT ·double8(SB), NOSPLIT, $48-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, R14 + SUBQ (SI), R14 + MOVQ DX, R15 + SBBQ 8(SI), R15 + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, (SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 8(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 16(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 24(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 32(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 40(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R14, CX + MOVQ CX, (DI) + CMOVQCC R15, DX + MOVQ DX, 8(DI) + CMOVQCC (SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 8(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 16(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 24(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 32(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 40(SP), R13 + MOVQ R13, 56(DI) + RET + + // | + +/* end */ + + RET + +// func sub8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64) +TEXT ·sub8(SB), NOSPLIT, $48-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + + // | + MOVQ p+24(FP), SI + MOVQ (SI), R14 + CMOVQCC AX, R14 + MOVQ 8(SI), R15 + CMOVQCC AX, R15 + CMOVQCS 16(SI), AX + MOVQ AX, (SP) + CMOVQCS 24(SI), AX + MOVQ AX, 8(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 16(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 24(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 40(SP) + + // | + MOVQ c+0(FP), DI + ADDQ R14, CX + MOVQ CX, (DI) + ADCQ R15, DX + MOVQ DX, 8(DI) + ADCQ (SP), R8 + MOVQ R8, 16(DI) + ADCQ 8(SP), R9 + MOVQ R9, 24(DI) + ADCQ 16(SP), R10 + MOVQ R10, 32(DI) + ADCQ 24(SP), R11 + MOVQ R11, 40(DI) + ADCQ 32(SP), R12 + MOVQ R12, 48(DI) + ADCQ 40(SP), R13 + MOVQ R13, 56(DI) + RET + + // | + +/* end */ + + RET + +// func subn8(a *[8]uint64, b *[8]uint64) uint64 +TEXT ·subn8(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* 
end */ + + RET + +// func _neg8(c *[8]uint64, a *[8]uint64, p *[8]uint64) +TEXT ·_neg8(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_8(a *[8]uint64) +TEXT ·mul_two_8(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RET + +// func div_two_8(a *[8]uint64) +TEXT ·div_two_8(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64, inp uint64) +TEXT ·mul8(SB), NOSPLIT, $48-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + ADCQ $0x00, R14 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R15, R15 + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ CX, CX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R8, R8 + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + 
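+ // | (MULXQ leaves the flags untouched, so the ADOXQ/ADCXQ pair that follows
+ // | each product can keep two independent carry chains alive across the row)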
ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R9, R9 + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R10, R10 + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R11, R11 + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ DI, DI + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 CX | 11 R8 | 12 R9 | 13 R10 | 14 R11 | 15 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R11, 8(SP) + MOVQ 24(SP), R11 + MOVQ R10, 16(SP) + MOVQ 32(SP), R10 + MOVQ R9, 24(SP) + MOVQ 40(SP), R9 + MOVQ R8, 32(SP) + MOVQ CX, 40(SP) + + // | fetch modulus + MOVQ p+24(FP), CX + + // | + // | W ready to mont 
+ // | 0 BX | 1 SI | 2 DI | 3 R11 | 4 R10 | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | + +/* montgomery reduction */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R11 | 4 R10 | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R11 + + // | j3 + + // | w3 @ R11 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j4 + + // | w4 @ R10 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j5 + + // | w5 @ R9 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j6 + + // | w6 @ R12 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j7 + + // | w7 @ R13 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + ADOXQ BX, R14 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R11 | 4 R10 | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R11 + + // | j2 + + // | w3 @ R11 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j3 + + // | w4 @ R10 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j4 + + // | w5 @ R9 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j5 + + // | w6 @ R12 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j6 + + // | w7 @ R13 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j7 + + // | w8 @ R14 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + ADOXQ BX, R15 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R11 | 4 R10 | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R11 + + // | j1 + + // | w3 @ R11 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j2 + + // | w4 @ R10 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j3 + + // | w5 @ R9 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j4 + + // | w6 @ R12 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j5 + + // | w7 @ R13 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j6 + + // | w8 @ R14 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j7 + + // | w9 @ R15 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R15 + + // | w10 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), BX + ADCXQ R8, BX + ADOXQ SI, BX + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R11 | 4 R10 | 5 R9 | 6 R12 | 7 R13 + 
// | 8 R14 | 9 R15 | 10 BX | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u3 = w3 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w3 @ R11 + MULXQ (CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j1 + + // | w4 @ R10 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j2 + + // | w5 @ R9 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j3 + + // | w6 @ R12 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j4 + + // | w7 @ R13 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j5 + + // | w8 @ R14 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j6 + + // | w9 @ R15 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(CX), AX, R8 + ADOXQ AX, BX + + // | w11 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), SI + ADCXQ R8, SI + ADOXQ DI, SI + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R10 | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 BX | 11 SI | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u4 = w4 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w4 @ R10 + MULXQ (CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j1 + + // | w5 @ R9 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j2 + + // | w6 @ R12 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j3 + + // | w7 @ R13 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j4 + + // | w8 @ R14 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j5 + + // | w9 @ R15 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(CX), AX, R8 + ADOXQ AX, SI + + // | w12 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), DI + ADCXQ R8, DI + ADOXQ R11, DI + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R9 | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 BX | 11 SI | 12 DI | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u5 = w5 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w5 @ R9 + MULXQ (CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R12 + + // | j1 + + // | w6 @ R12 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R13 + + // | j2 + + // | w7 @ R13 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j3 + + // | w8 @ R14 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j4 + + // | w9 @ R15 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(CX), AX, R8 + ADOXQ AX, DI + + // | w13 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), R11 + ADCXQ R8, R11 + ADOXQ R10, R11 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R12 | 7 R13 + // | 8 R14 | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R11 | 14 8(SP) | 15 (SP) + + + // | | u6 = w6 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w6 @ R12 + MULXQ (CX), AX, R8 + ADOXQ 
AX, R12 + ADCXQ R8, R13 + + // | j1 + + // | w7 @ R13 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j2 + + // | w8 @ R14 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j3 + + // | w9 @ R15 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R11 + + // | j7 + + // | w13 @ R11 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R11 + + // | w14 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), R10 + ADCXQ R8, R10 + ADOXQ R9, R10 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R13 + // | 8 R14 | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R11 | 14 R10 | 15 (SP) + + + // | | u7 = w7 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w7 @ R13 + MULXQ (CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R14 + + // | j1 + + // | w8 @ R14 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j2 + + // | w9 @ R15 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R11 + + // | j6 + + // | w13 @ R11 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j7 + + // | w14 @ R10 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R10 + + // | w15 @ (SP) + // | move to an idle register + MOVQ (SP), R9 + ADCXQ R8, R9 + ADOXQ R12, R9 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - + // | 8 R14 | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R11 | 14 R10 | 15 R9 + + + // | + +/* modular reduction */ + + MOVQ R14, AX + SUBQ (CX), AX + MOVQ R15, R8 + SBBQ 8(CX), R8 + MOVQ BX, R12 + SBBQ 16(CX), R12 + MOVQ SI, DX + SBBQ 24(CX), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 32(CX), DX + MOVQ DX, 8(SP) + MOVQ R11, DX + SBBQ 40(CX), DX + MOVQ DX, 16(SP) + MOVQ R10, DX + SBBQ 48(CX), DX + MOVQ DX, 24(SP) + MOVQ R9, DX + SBBQ 56(CX), DX + MOVQ DX, 32(SP) + SBBQ $0x00, R13 + + // | + +/* out */ + + MOVQ c+0(FP), R13 + CMOVQCC AX, R14 + MOVQ R14, (R13) + CMOVQCC R8, R15 + MOVQ R15, 8(R13) + CMOVQCC R12, BX + MOVQ BX, 16(R13) + CMOVQCC (SP), SI + MOVQ SI, 24(R13) + CMOVQCC 8(SP), DI + MOVQ DI, 32(R13) + CMOVQCC 16(SP), R11 + MOVQ R11, 40(R13) + CMOVQCC 24(SP), R10 + MOVQ R10, 48(R13) + CMOVQCC 32(SP), R9 + MOVQ R9, 56(R13) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_8(c *[8]uint64, a *[8]uint64, b *[8]uint64, p *[8]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_8(SB), NOSPLIT, $128-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + 
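+ // | (non-ADX fallback: each partial product uses plain MULQ with a single
+ // | ADD/ADC carry chain instead of MULXQ with the dual ADOX/ADCX chains)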
ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, 
BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ 
$0x00, BX + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 R14 + // | 8 R15 | 9 R8 | 10 R9 | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R13, 8(SP) + MOVQ 32(SP), R13 + MOVQ R12, 16(SP) + MOVQ 40(SP), R12 + MOVQ R11, 24(SP) + MOVQ 48(SP), R11 + MOVQ R10, 32(SP) + MOVQ R9, 40(SP) + MOVQ R8, 48(SP) + + // | fetch modulus + MOVQ p+24(FP), R10 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 48(SP) | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u0 + MOVQ R9, 56(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R13 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R12 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R11 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w7 @ R14 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + + // | w8 @ R15 + ADCQ DX, R15 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 48(SP) | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u1 + MOVQ R9, 64(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R13 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R12 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R11 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R14 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w8 @ R15 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R8, R15 + + // | move to idle register + MOVQ 48(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 
R13 | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 DI | 10 40(SP) | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u2 + MOVQ R9, 48(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w3 @ BX + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w4 @ R13 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w5 @ R12 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w6 @ R11 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w7 @ R14 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w8 @ R15 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w9 @ DI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to idle register + MOVQ 40(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 DI | 10 SI | 11 32(SP) | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u3 + MOVQ R9, 40(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w4 @ R13 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w5 @ R12 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w6 @ R11 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w7 @ R14 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w8 @ R15 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w10 @ SI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | move to idle register + MOVQ 32(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R13 | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 DI | 10 SI | 11 BX | 12 24(SP) | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u4 = w4 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u4 + MOVQ R9, 32(SP) + + // | j0 + + // | w4 @ R13 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R8 + + // | j1 + + // | w5 @ R12 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w6 @ R11 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w7 @ R14 + MOVQ 
24(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w8 @ R15 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ DI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ SI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w11 @ BX + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to idle register + MOVQ 24(SP), R13 + + // | w12 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 | 6 R11 | 7 R14 + // | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 16(SP) | 14 8(SP) | 15 (SP) + + + // | | u5 = w5 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u5 + MOVQ R9, 24(SP) + + // | j0 + + // | w5 @ R12 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ DX, R8 + + // | j1 + + // | w6 @ R11 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w7 @ R14 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w8 @ R15 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ SI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ BX + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w12 @ R13 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | move to idle register + MOVQ 16(SP), R12 + + // | w13 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R11 | 7 R14 + // | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 8(SP) | 15 (SP) + + + // | | u6 = w6 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u6 + MOVQ R9, 16(SP) + + // | j0 + + // | w6 @ R11 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, R8 + + // | j1 + + // | w7 @ R14 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R15 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ SI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ BX + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R13 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w13 @ R12 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 + + // | move to idle register + MOVQ 8(SP), R11 + + // | w14 
@ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R14 + // | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 (SP) + + + // | | u7 = w7 * inp + MOVQ R14, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u7 + MOVQ R9, 8(SP) + + // | j0 + + // | w7 @ R14 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ DX, R8 + + // | j1 + + // | w8 @ R15 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w9 @ DI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w10 @ SI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w11 @ BX + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w12 @ R13 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w13 @ R12 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w14 @ R11 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | move to idle register + MOVQ (SP), R14 + + // | w-1 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W montgomerry reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - + // | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R14 + + + // | + +/* modular reduction */ + + MOVQ R15, DX + SUBQ (R10), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 8(R10), DX + MOVQ DX, 72(SP) + MOVQ SI, DX + SBBQ 16(R10), DX + MOVQ DX, 80(SP) + MOVQ BX, DX + SBBQ 24(R10), DX + MOVQ DX, 88(SP) + MOVQ R13, DX + SBBQ 32(R10), DX + MOVQ DX, 96(SP) + MOVQ R12, DX + SBBQ 40(R10), DX + MOVQ DX, 104(SP) + MOVQ R11, DX + SBBQ 48(R10), DX + MOVQ DX, 112(SP) + MOVQ R14, DX + SBBQ 56(R10), DX + MOVQ DX, 120(SP) + SBBQ $0x00, CX + + // | + +/* out */ + + MOVQ c+0(FP), CX + CMOVQCC (SP), R15 + MOVQ R15, (CX) + CMOVQCC 72(SP), DI + MOVQ DI, 8(CX) + CMOVQCC 80(SP), SI + MOVQ SI, 16(CX) + CMOVQCC 88(SP), BX + MOVQ BX, 24(CX) + CMOVQCC 96(SP), R13 + MOVQ R13, 32(CX) + CMOVQCC 104(SP), R12 + MOVQ R12, 40(CX) + CMOVQCC 112(SP), R11 + MOVQ R11, 48(CX) + CMOVQCC 120(SP), R14 + MOVQ R14, 56(CX) + RET + + // | + +/* end */ + + +// func cpy9(dst *[9]uint64, src *[9]uint64) +TEXT ·cpy9(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + RET + +// func eq9(a *[9]uint64, b *[9]uint64) bool +TEXT ·eq9(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + 
RET + +// func cmp9(a *[9]uint64, b *[9]uint64) int8 +TEXT ·cmp9(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64) +TEXT ·add9(SB), NOSPLIT, $64-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, R15 + SUBQ (SI), R15 + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, (SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 8(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 16(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 24(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 32(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 40(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 48(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 56(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R15, CX + MOVQ CX, (DI) + CMOVQCC (SP), DX + MOVQ DX, 8(DI) + CMOVQCC 8(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 16(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 24(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 32(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 40(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 48(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 56(SP), R14 + MOVQ R14, 64(DI) + RET + + // | + +/* end */ + + RET + +// func addn9(a *[9]uint64, b *[9]uint64) uint64 +TEXT ·addn9(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double9(c *[9]uint64, a *[9]uint64, p *[9]uint64) +TEXT ·double9(SB), NOSPLIT, $64-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, R15 + SUBQ (SI), R15 + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, (SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 8(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 16(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 24(SP) + 
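+ // | (the doubled value may exceed p; this borrow chain computes 2a - p so
+ // | that the CMOVQCC sequence below can select the reduced result)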
MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 32(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 40(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 48(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 56(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC R15, CX + MOVQ CX, (DI) + CMOVQCC (SP), DX + MOVQ DX, 8(DI) + CMOVQCC 8(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 16(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 24(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 32(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 40(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 48(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 56(SP), R14 + MOVQ R14, 64(DI) + RET + + // | + +/* end */ + + RET + +// func sub9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64) +TEXT ·sub9(SB), NOSPLIT, $64-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + + // | + MOVQ p+24(FP), SI + MOVQ (SI), R15 + CMOVQCC AX, R15 + CMOVQCS 8(SI), AX + MOVQ AX, (SP) + CMOVQCS 16(SI), AX + MOVQ AX, 8(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 16(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 24(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 56(SP) + + // | + MOVQ c+0(FP), DI + ADDQ R15, CX + MOVQ CX, (DI) + ADCQ (SP), DX + MOVQ DX, 8(DI) + ADCQ 8(SP), R8 + MOVQ R8, 16(DI) + ADCQ 16(SP), R9 + MOVQ R9, 24(DI) + ADCQ 24(SP), R10 + MOVQ R10, 32(DI) + ADCQ 32(SP), R11 + MOVQ R11, 40(DI) + ADCQ 40(SP), R12 + MOVQ R12, 48(DI) + ADCQ 48(SP), R13 + MOVQ R13, 56(DI) + ADCQ 56(SP), R14 + MOVQ R14, 64(DI) + RET + + // | + +/* end */ + + RET + +// func subn9(a *[9]uint64, b *[9]uint64) uint64 +TEXT ·subn9(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg9(c *[9]uint64, a *[9]uint64, p *[9]uint64) +TEXT ·_neg9(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_9(a *[9]uint64) +TEXT ·mul_two_9(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ 
$0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RET + +// func div_two_9(a *[9]uint64) +TEXT ·div_two_9(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64, inp uint64) +TEXT ·mul9(SB), NOSPLIT, $64-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, 
R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ DI, DI + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + 
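+ // | (final row of the 9x9 limb schoolbook multiplication; once a8 * b8 is
+ // | accumulated the 18-word product is complete and is then rearranged for
+ // | Montgomery reduction)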
ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 R14 + // | 9 R15 | 10 CX | 11 R8 | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R13, 8(SP) + MOVQ 24(SP), R13 + MOVQ R12, 16(SP) + MOVQ 32(SP), R12 + MOVQ R11, 24(SP) + MOVQ 40(SP), R11 + MOVQ R10, 32(SP) + MOVQ 48(SP), R10 + MOVQ R9, 40(SP) + MOVQ 56(SP), R9 + MOVQ R8, 48(SP) + MOVQ CX, 56(SP) + + // | fetch modulus + MOVQ p+24(FP), CX + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R13 | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | + +/* montgomery reduction */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R13 | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j3 + + // | w3 @ R13 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j4 + + // | w4 @ R12 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j5 + + // | w5 @ R11 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j6 + + // | w6 @ R10 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j7 + + // | w7 @ R9 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j8 + + // | w8 @ R14 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + ADOXQ BX, R15 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R13 | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j2 + + // | w3 @ R13 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j3 + + // | w4 @ R12 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j4 + + // | w5 @ R11 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j5 + + // | w6 @ R10 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j6 + + // | w7 @ R9 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j7 + + // | w8 @ R14 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j8 + + // | w9 @ R15 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R15 + + // | w10 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R8, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R13 | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 48(SP) | 12 40(SP) | 13 
32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j1 + + // | w3 @ R13 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j2 + + // | w4 @ R12 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j3 + + // | w5 @ R11 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j4 + + // | w6 @ R10 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j5 + + // | w7 @ R9 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j6 + + // | w8 @ R14 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j7 + + // | w9 @ R15 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(CX), AX, R8 + ADOXQ AX, BX + + // | w11 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R8, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R13 | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u3 = w3 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w3 @ R13 + MULXQ (CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j1 + + // | w4 @ R12 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j2 + + // | w5 @ R11 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j3 + + // | w6 @ R10 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j4 + + // | w7 @ R9 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j5 + + // | w8 @ R14 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j6 + + // | w9 @ R15 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(CX), AX, R8 + ADOXQ AX, SI + + // | w12 @ 40(SP) + // | move to temp register + MOVQ 40(SP), AX + ADCXQ R8, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R12 | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u4 = w4 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w4 @ R12 + MULXQ (CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j1 + + // | w5 @ R11 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j2 + + // | w6 @ R10 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j3 + + // | w7 @ R9 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j4 + + // | w8 @ R14 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j5 + + // | w9 @ R15 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(CX), AX, R8 + ADOXQ AX, DI + + // | w13 @ 32(SP) + // | move to temp register + MOVQ 32(SP), AX + ADCXQ R8, AX + ADOXQ R13, AX + + // | move to an 
idle register + // | w13 @ AX + MOVQ AX, R13 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R11 | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R13 | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u5 = w5 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w5 @ R11 + MULXQ (CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j1 + + // | w6 @ R10 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j2 + + // | w7 @ R9 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j3 + + // | w8 @ R14 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j4 + + // | w9 @ R15 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j8 + + // | w13 @ R13 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R13 + + // | w14 @ 24(SP) + // | move to temp register + MOVQ 24(SP), AX + ADCXQ R8, AX + ADOXQ R12, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R12 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R10 | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R13 | 14 R12 | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u6 = w6 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w6 @ R10 + MULXQ (CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j1 + + // | w7 @ R9 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j2 + + // | w8 @ R14 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j3 + + // | w9 @ R15 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j7 + + // | w13 @ R13 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j8 + + // | w14 @ R12 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R12 + + // | w15 @ 16(SP) + // | move to temp register + MOVQ 16(SP), AX + ADCXQ R8, AX + ADOXQ R11, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R11 + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R9 | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R13 | 14 R12 | 15 R11 | 16 8(SP) | 17 (SP) + + + // | | u7 = w7 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w7 @ R9 + MULXQ (CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, R14 + + // | j1 + + // | w8 @ R14 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j2 + + // | w9 @ R15 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j6 + + // | w13 @ R13 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j7 + + // | 
w14 @ R12 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j8 + + // | w15 @ R11 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R11 + + // | w16 @ 8(SP) + // | move to temp register + MOVQ 8(SP), AX + ADCXQ R8, AX + ADOXQ R10, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R10 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R14 + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R13 | 14 R12 | 15 R11 | 16 R10 | 17 (SP) + + + // | | u8 = w8 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R8 + + // | + +/* */ + + // | j0 + + // | w8 @ R14 + MULXQ (CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R15 + + // | j1 + + // | w9 @ R15 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R13 + + // | j5 + + // | w13 @ R13 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j6 + + // | w14 @ R12 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j7 + + // | w15 @ R11 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j8 + + // | w16 @ R10 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R10 + + // | w17 @ (SP) + // | move to temp register + MOVQ (SP), AX + ADCXQ R8, AX + ADOXQ R9, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R9 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | + // | W montgomery reduction ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - + // | 9 R15 | 10 BX | 11 SI | 12 DI | 13 R13 | 14 R12 | 15 R11 | 16 R10 | 17 R9 + + + // | + +/* modular reduction */ + + MOVQ R15, AX + SUBQ (CX), AX + MOVQ BX, R8 + SBBQ 8(CX), R8 + MOVQ SI, DX + SBBQ 16(CX), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 24(CX), DX + MOVQ DX, 8(SP) + MOVQ R13, DX + SBBQ 32(CX), DX + MOVQ DX, 16(SP) + MOVQ R12, DX + SBBQ 40(CX), DX + MOVQ DX, 24(SP) + MOVQ R11, DX + SBBQ 48(CX), DX + MOVQ DX, 32(SP) + MOVQ R10, DX + SBBQ 56(CX), DX + MOVQ DX, 40(SP) + MOVQ R9, DX + SBBQ 64(CX), DX + MOVQ DX, 48(SP) + SBBQ $0x00, R14 + + // | + +/* out */ + + MOVQ c+0(FP), R14 + CMOVQCC AX, R15 + MOVQ R15, (R14) + CMOVQCC R8, BX + MOVQ BX, 8(R14) + CMOVQCC (SP), SI + MOVQ SI, 16(R14) + CMOVQCC 8(SP), DI + MOVQ DI, 24(R14) + CMOVQCC 16(SP), R13 + MOVQ R13, 32(R14) + CMOVQCC 24(SP), R12 + MOVQ R12, 40(R14) + CMOVQCC 32(SP), R11 + MOVQ R11, 48(R14) + CMOVQCC 40(SP), R10 + MOVQ R10, 56(R14) + CMOVQCC 48(SP), R9 + MOVQ R9, 64(R14) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_9(c *[9]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_9(SB), NOSPLIT, $144-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 
48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + 
MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ 
DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, BX + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 R15 + // | 9 R8 | 10 R9 | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 BX | 17 - + + + MOVQ R15, 64(SP) + MOVQ R8, 72(SP) + MOVQ R9, 80(SP) + MOVQ R10, 88(SP) + MOVQ R11, 96(SP) + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ BX, 128(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) + // | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 8 */ + + // | b8 @ CX + MOVQ 64(SI), CX + MOVQ $0x00, BX + + // | a0 * b8 + MOVQ (DI), AX + MULQ CX + MOVQ AX, 136(SP) + MOVQ DX, R8 + + // | a1 * b8 + MOVQ 8(DI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a2 * b8 + MOVQ 16(DI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a3 * b8 + MOVQ 24(DI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a4 * b8 + MOVQ 32(DI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a5 * b8 + MOVQ 40(DI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a6 * b8 + MOVQ 48(DI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a7 * b8 + MOVQ 56(DI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | a8 * b8 + MOVQ 64(DI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 136(SP) + // | 9 R8 | 10 R9 | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) + // | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 - + + + MOVQ 64(SP), AX + ADDQ AX, 136(SP) + ADCQ 72(SP), R8 + ADCQ 80(SP), R9 + ADCQ 88(SP), R10 + ADCQ 96(SP), R11 + ADCQ 104(SP), R12 + ADCQ 112(SP), R13 + ADCQ 120(SP), R14 + ADCQ 128(SP), R15 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 136(SP) + // | 9 R8 | 10 R9 | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R15, 8(SP) + MOVQ 32(SP), R15 + MOVQ R14, 16(SP) + MOVQ 40(SP), R14 + MOVQ R13, 
24(SP) + MOVQ 48(SP), R13 + MOVQ R12, 32(SP) + MOVQ 56(SP), R12 + MOVQ R11, 40(SP) + MOVQ 136(SP), R11 + MOVQ R10, 48(SP) + MOVQ R9, 56(SP) + MOVQ R8, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R10 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 64(SP) | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u0 + MOVQ R9, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R15 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R14 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R13 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w7 @ R12 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + + // | w8 @ R11 + ADCQ DX, R11 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 64(SP) | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u1 + MOVQ R9, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R15 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R14 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R13 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R12 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w8 @ R11 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | move to idle register + MOVQ 64(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 DI | 10 56(SP) | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u2 + MOVQ R9, 64(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R10), AX + 
MULQ R9 + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w3 @ BX + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w4 @ R15 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w5 @ R14 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w6 @ R13 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w7 @ R12 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w8 @ R11 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w9 @ DI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to idle register + MOVQ 56(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 DI | 10 SI | 11 48(SP) | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u3 + MOVQ R9, 56(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w4 @ R15 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w5 @ R14 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w6 @ R13 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w7 @ R12 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w8 @ R11 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w10 @ SI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | move to idle register + MOVQ 48(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 40(SP) | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u4 = w4 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u4 + MOVQ R9, 48(SP) + + // | j0 + + // | w4 @ R15 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, R8 + + // | j1 + + // | w5 @ R14 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w6 @ R13 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w7 @ R12 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w8 @ R11 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ 
DI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ SI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w11 @ BX + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to idle register + MOVQ 40(SP), R15 + + // | w12 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R14 | 6 R13 | 7 R12 | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 32(SP) | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u5 = w5 * inp + MOVQ R14, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u5 + MOVQ R9, 40(SP) + + // | j0 + + // | w5 @ R14 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ DX, R8 + + // | j1 + + // | w6 @ R13 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w7 @ R12 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w8 @ R11 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ SI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ BX + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w12 @ R15 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R8, R15 + + // | move to idle register + MOVQ 32(SP), R14 + + // | w13 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R13 | 7 R12 | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 R14 | 14 24(SP) | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u6 = w6 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u6 + MOVQ R9, 32(SP) + + // | j0 + + // | w6 @ R13 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R8 + + // | j1 + + // | w7 @ R12 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R11 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ SI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ BX + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R15 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w13 @ R14 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R8, R14 + + // | move to idle register + MOVQ 24(SP), R13 + + // | w14 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R12 | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 R14 | 14 
R13 | 15 16(SP) | 16 8(SP) | 17 (SP) + + + // | | u7 = w7 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u7 + MOVQ R9, 24(SP) + + // | j0 + + // | w7 @ R12 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ DX, R8 + + // | j1 + + // | w8 @ R11 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w9 @ DI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w10 @ SI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w11 @ BX + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w12 @ R15 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w13 @ R14 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w14 @ R13 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | move to idle register + MOVQ 16(SP), R12 + + // | w15 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 R14 | 14 R13 | 15 R12 | 16 8(SP) | 17 (SP) + + + // | + +/* montgomerry reduction q2 */ + + MOVQ $0x00, R8 + + // | + +/* i = 0 */ + + // | w8 @ R11 + MOVQ 64(R10), AX + MULQ 72(SP) + ADDQ AX, R11 + ADCQ DX, DI + ADCQ $0x00, R8 + + // | + +/* i = 1 */ + + // | w9 @ DI + MOVQ 64(R10), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, SI + ADCQ $0x00, R8 + + // | + +/* i = 2 */ + + // | w10 @ SI + MOVQ 64(R10), AX + MULQ 64(SP) + ADDQ AX, SI + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, BX + ADCQ $0x00, R8 + + // | + +/* i = 3 */ + + // | w11 @ BX + MOVQ 64(R10), AX + MULQ 56(SP) + ADDQ AX, BX + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, R15 + ADCQ $0x00, R8 + + // | + +/* i = 4 */ + + // | w12 @ R15 + MOVQ 64(R10), AX + MULQ 48(SP) + ADDQ AX, R15 + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, R14 + ADCQ $0x00, R8 + + // | + +/* i = 5 */ + + // | w13 @ R14 + MOVQ 64(R10), AX + MULQ 40(SP) + ADDQ AX, R14 + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, R13 + ADCQ $0x00, R8 + + // | + +/* i = 6 */ + + // | w14 @ R13 + MOVQ 64(R10), AX + MULQ 32(SP) + ADDQ AX, R13 + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, R12 + + // | carry from q1 + ADCQ CX, R8 + + // | + +/* i = 7 */ + + // | w15 @ R12 + MOVQ 64(R10), AX + MULQ 24(SP) + ADDQ AX, R12 + ADCQ R8, DX + MOVQ $0x00, R8 + ADDQ DX, 8(SP) + ADCQ $0x00, R8 + + // | + // | W q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 R14 | 14 R13 | 15 R12 | 16 8(SP) | 17 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | | u8 = w8 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, CX + + // | j0 + + // | w8 @ R11 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, CX + + // | j1 + + // | w9 @ DI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ CX, DI + MOVQ $0x00, CX + ADCQ DX, CX + + // | j2 + + // | w10 @ SI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ CX, SI + MOVQ $0x00, CX + ADCQ DX, CX + + // | j3 + + // | w11 @ BX + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ CX, BX + MOVQ $0x00, CX + ADCQ 
DX, CX + + // | j4 + + // | w12 @ R15 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ CX, R15 + MOVQ $0x00, CX + ADCQ DX, CX + + // | j5 + + // | w13 @ R14 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ CX, R14 + MOVQ $0x00, CX + ADCQ DX, CX + + // | j6 + + // | w14 @ R13 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ CX, R13 + MOVQ $0x00, CX + ADCQ DX, CX + + // | j7 + + // | w15 @ R12 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ CX, R12 + MOVQ $0x00, CX + ADCQ DX, CX + + // | j8 + + // | w16 @ 8(SP) + // | move to idle register + MOVQ 8(SP), R11 + MOVQ 64(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, R8 + ADDQ CX, R11 + + // | w17 @ (SP) + ADCQ (SP), R8 + + // | care the last bit + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - + // | 9 DI | 10 SI | 11 BX | 12 R15 | 13 R14 | 14 R13 | 15 R12 | 16 R11 | 17 R8 + + + // | + +/* modular reduction */ + + MOVQ DI, DX + SUBQ (R10), DX + MOVQ DX, 16(SP) + MOVQ SI, DX + SBBQ 8(R10), DX + MOVQ DX, 24(SP) + MOVQ BX, DX + SBBQ 16(R10), DX + MOVQ DX, 32(SP) + MOVQ R15, DX + SBBQ 24(R10), DX + MOVQ DX, 40(SP) + MOVQ R14, DX + SBBQ 32(R10), DX + MOVQ DX, 48(SP) + MOVQ R13, DX + SBBQ 40(R10), DX + MOVQ DX, 56(SP) + MOVQ R12, DX + SBBQ 48(R10), DX + MOVQ DX, 64(SP) + MOVQ R11, DX + SBBQ 56(R10), DX + MOVQ DX, 80(SP) + MOVQ R8, DX + SBBQ 64(R10), DX + MOVQ DX, 88(SP) + SBBQ $0x00, CX + + // | + +/* out */ + + MOVQ c+0(FP), CX + CMOVQCC 16(SP), DI + MOVQ DI, (CX) + CMOVQCC 24(SP), SI + MOVQ SI, 8(CX) + CMOVQCC 32(SP), BX + MOVQ BX, 16(CX) + CMOVQCC 40(SP), R15 + MOVQ R15, 24(CX) + CMOVQCC 48(SP), R14 + MOVQ R14, 32(CX) + CMOVQCC 56(SP), R13 + MOVQ R13, 40(CX) + CMOVQCC 64(SP), R12 + MOVQ R12, 48(CX) + CMOVQCC 80(SP), R11 + MOVQ R11, 56(CX) + CMOVQCC 88(SP), R8 + MOVQ R8, 64(CX) + RET + + // | + +/* end */ + + +// func cpy10(dst *[10]uint64, src *[10]uint64) +TEXT ·cpy10(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + RET + +// func eq10(a *[10]uint64, b *[10]uint64) bool +TEXT ·eq10(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp10(a *[10]uint64, b *[10]uint64) int8 +TEXT ·cmp10(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + 
CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64) +TEXT ·add10(SB), NOSPLIT, $80-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, (SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 8(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 16(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 24(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 32(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 40(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 48(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 56(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 64(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 72(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC (SP), CX + MOVQ CX, (DI) + CMOVQCC 8(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 16(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 24(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 32(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 40(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 48(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 56(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 64(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 72(SP), R15 + MOVQ R15, 72(DI) + RET + + // | + +/* end */ + + RET + +// func addn10(a *[10]uint64, b *[10]uint64) uint64 +TEXT ·addn10(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double10(c *[10]uint64, a *[10]uint64, p *[10]uint64) +TEXT ·double10(SB), NOSPLIT, $80-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, (SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 8(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 16(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 24(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 32(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 40(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 48(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 56(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + 
MOVQ BX, 64(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 72(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC (SP), CX + MOVQ CX, (DI) + CMOVQCC 8(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 16(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 24(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 32(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 40(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 48(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 56(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 64(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 72(SP), R15 + MOVQ R15, 72(DI) + RET + + // | + +/* end */ + + RET + +// func sub10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64) +TEXT ·sub10(SB), NOSPLIT, $80-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, (SP) + CMOVQCS 8(SI), AX + MOVQ AX, 8(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 16(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 24(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 72(SP) + + // | + MOVQ c+0(FP), DI + ADDQ (SP), CX + MOVQ CX, (DI) + ADCQ 8(SP), DX + MOVQ DX, 8(DI) + ADCQ 16(SP), R8 + MOVQ R8, 16(DI) + ADCQ 24(SP), R9 + MOVQ R9, 24(DI) + ADCQ 32(SP), R10 + MOVQ R10, 32(DI) + ADCQ 40(SP), R11 + MOVQ R11, 40(DI) + ADCQ 48(SP), R12 + MOVQ R12, 48(DI) + ADCQ 56(SP), R13 + MOVQ R13, 56(DI) + ADCQ 64(SP), R14 + MOVQ R14, 64(DI) + ADCQ 72(SP), R15 + MOVQ R15, 72(DI) + RET + + // | + +/* end */ + + RET + +// func subn10(a *[10]uint64, b *[10]uint64) uint64 +TEXT ·subn10(SB), NOSPLIT, $0-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg10(c *[10]uint64, a *[10]uint64, p *[10]uint64) +TEXT ·_neg10(SB), NOSPLIT, $0-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_10(a *[10]uint64) +TEXT ·mul_two_10(SB), NOSPLIT, $0-8 + 
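+	// | mul_two_10: in-place shift left by one bit. CF is cleared first,
+	// | then each limb is rotated through carry from the lowest to the
+	// | highest limb; the bit shifted out of the top limb is dropped and
+	// | no reduction modulo p is performed.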
MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RET + +// func div_two_10(a *[10]uint64) +TEXT ·div_two_10(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64, inp uint64) +TEXT ·mul10(SB), NOSPLIT, $160-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | 
a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 
32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ BX, R15 + ADCQ $0x00, R15 + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) + // | 10 CX | 11 R8 | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 - + + + MOVQ CX, 80(SP) + MOVQ R8, 88(SP) + MOVQ R9, 96(SP) + MOVQ R10, 104(SP) + MOVQ R11, 112(SP) + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) + // | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 152(SP) + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R8, R8 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R9, R9 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R10, R10 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R11, R11 + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R12, R12 + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R13, R13 + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R14, R14 + + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R15, R15 + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ DI, DI + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 152(SP) + // | 10 CX | 11 R8 | 12 R9 | 13 R10 
| 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) + // | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 - + + + MOVQ 72(SP), AX + ADDQ AX, 152(SP) + ADCQ 80(SP), CX + ADCQ 88(SP), R8 + ADCQ 96(SP), R9 + ADCQ 104(SP), R10 + ADCQ 112(SP), R11 + ADCQ 120(SP), R12 + ADCQ 128(SP), R13 + ADCQ 136(SP), R14 + ADCQ 144(SP), R15 + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 152(SP) + // | 10 CX | 11 R8 | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R15, 8(SP) + MOVQ 24(SP), R15 + MOVQ R14, 16(SP) + MOVQ 32(SP), R14 + MOVQ R13, 24(SP) + MOVQ 40(SP), R13 + MOVQ R12, 32(SP) + MOVQ 48(SP), R12 + MOVQ R11, 40(SP) + MOVQ 56(SP), R11 + MOVQ R10, 48(SP) + MOVQ 64(SP), R10 + MOVQ R9, 56(SP) + MOVQ 152(SP), R9 + MOVQ R8, 64(SP) + MOVQ CX, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), CX + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R15 | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 72(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R15 | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 72(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R8 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j3 + + // | w3 @ R15 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j4 + + // | w4 @ R14 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j5 + + // | w5 @ R13 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j6 + + // | w6 @ R12 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j7 + + // | w7 @ R11 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j8 + + // | w8 @ R10 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + ADOXQ BX, R9 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R15 | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 72(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R8 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j2 + + // | w3 @ R15 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j3 + + // | w4 @ R14 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j4 + + // | w5 @ R13 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j5 + + // | w6 @ R12 + MULXQ 40(CX), 
AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j6 + + // | w7 @ R11 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j7 + + // | w8 @ R10 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j8 + + // | w9 @ R9 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R9 + + // | w10 @ 72(SP) + // | move to temp register + MOVQ 72(SP), AX + ADCXQ R8, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R15 | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R8 + + // | save u2 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j1 + + // | w3 @ R15 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j2 + + // | w4 @ R14 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j3 + + // | w5 @ R13 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j4 + + // | w6 @ R12 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j5 + + // | w7 @ R11 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j6 + + // | w8 @ R10 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j7 + + // | w9 @ R9 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(CX), AX, R8 + ADOXQ AX, BX + + // | w11 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R8, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R15 | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u3 = w3 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R8 + + // | save u3 + MOVQ DX, 64(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ R15 + MULXQ (CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j1 + + // | w4 @ R14 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j2 + + // | w5 @ R13 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j3 + + // | w6 @ R12 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j4 + + // | w7 @ R11 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j5 + + // | w8 @ R10 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j6 + + // | w9 @ R9 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(CX), AX, R8 + ADOXQ AX, SI + + // | w12 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R8, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R14 | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u4 = w4 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R8 + + // | save u4 + MOVQ DX, 56(SP) + + // | + +/* 
*/ + + // | j0 + + // | w4 @ R14 + MULXQ (CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j1 + + // | w5 @ R13 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j2 + + // | w6 @ R12 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j3 + + // | w7 @ R11 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j4 + + // | w8 @ R10 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j5 + + // | w9 @ R9 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(CX), AX, R8 + ADOXQ AX, DI + + // | w13 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R8, AX + ADOXQ R15, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R13 | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u5 = w5 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R8 + + // | save u5 + MOVQ DX, 48(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R13 + MULXQ (CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j1 + + // | w6 @ R12 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j2 + + // | w7 @ R11 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j3 + + // | w8 @ R10 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j4 + + // | w9 @ R9 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j8 + + // | w13 @ R15 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R15 + + // | w14 @ 40(SP) + // | move to temp register + MOVQ 40(SP), AX + ADCXQ R8, AX + ADOXQ R14, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R12 | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u6 = w6 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R8 + + // | save u6 + MOVQ DX, 40(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ R12 + MULXQ (CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j1 + + // | w7 @ R11 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j2 + + // | w8 @ R10 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j3 + + // | w9 @ R9 + MULXQ 24(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j7 + + // | w13 @ R15 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j8 + + // | w14 @ R14 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R14 + + // | w15 @ 32(SP) + // | move to temp register + MOVQ 32(SP), AX + ADCXQ R8, AX + ADOXQ R13, AX + + // | move to an idle register + // | w15 @ AX + 
MOVQ AX, R13 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R11 | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 R13 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u7 = w7 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R8 + + // | save u7 + MOVQ DX, 32(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R11 + MULXQ (CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j1 + + // | w8 @ R10 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j2 + + // | w9 @ R9 + MULXQ 16(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j6 + + // | w13 @ R15 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j7 + + // | w14 @ R14 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j8 + + // | w15 @ R13 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R13 + + // | w16 @ 24(SP) + // | move to temp register + MOVQ 24(SP), AX + ADCXQ R8, AX + ADOXQ R12, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R12 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R10 | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 R13 | 16 R12 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u8 = w8 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R8 + + // | save u8 + MOVQ DX, 24(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ R10 + MULXQ (CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | j1 + + // | w9 @ R9 + MULXQ 8(CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j5 + + // | w13 @ R15 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j6 + + // | w14 @ R14 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j7 + + // | w15 @ R13 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j8 + + // | w16 @ R12 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R12 + + // | w17 @ 16(SP) + // | move to temp register + MOVQ 16(SP), AX + ADCXQ R8, AX + ADOXQ R11, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R11 + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 R13 | 16 R12 | 17 R11 | 18 8(SP) | 19 (SP) + + + // | long carry R10 should be added to w18 + // | + +/* montgomerry reduction q2 */ + + MOVQ 72(CX), DX + XORQ AX, AX + + // | + +/* i = 0 */ + + // | w9 @ R9 + MULXQ 80(SP), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | + +/* i = 1 */ + + // | w10 @ BX + MULXQ 88(SP), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | + +/* i = 2 */ + + // | w11 @ SI + MULXQ 72(SP), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | + +/* i = 3 */ + + // | w12 @ DI + MULXQ 64(SP), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | + +/* i = 4 */ + + // | w13 @ R15 + MULXQ 56(SP), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | + +/* i = 5 */ + + // | 
w14 @ R14 + MULXQ 48(SP), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | + +/* i = 6 */ + + // | w15 @ R13 + MULXQ 40(SP), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | + +/* i = 7 */ + + // | w16 @ R12 + MULXQ 32(SP), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | + +/* i = 8 */ + + // | w17 @ R11 + MULXQ 24(SP), AX, R8 + ADOXQ AX, R11 + + // | aggregate carries + // | R10 + R8 should be added to w18 @ 8(SP) + // | notice that aggregated value can be at most (2^64 - 1) + ADCXQ R8, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R9 + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 R13 | 16 R12 | 17 R11 | 18 8(SP) | 19 (SP) + + + // | + +/* montgomerry reduction q3 & q4 */ + + // | + +/* i = 9 */ + + // | | u9 = w9 * inp + XORQ AX, AX + MOVQ R9, DX + MULXQ inp+32(FP), DX, R8 + + // | j0 + + // | w9 @ R9 + MULXQ (CX), AX, R8 + ADOXQ AX, R9 + ADCXQ R8, BX + + // | j1 + + // | w10 @ BX + MULXQ 8(CX), AX, R8 + ADOXQ AX, BX + ADCXQ R8, SI + + // | j2 + + // | w11 @ SI + MULXQ 16(CX), AX, R8 + ADOXQ AX, SI + ADCXQ R8, DI + + // | j3 + + // | w12 @ DI + MULXQ 24(CX), AX, R8 + ADOXQ AX, DI + ADCXQ R8, R15 + + // | j4 + + // | w13 @ R15 + MULXQ 32(CX), AX, R8 + ADOXQ AX, R15 + ADCXQ R8, R14 + + // | j5 + + // | w14 @ R14 + MULXQ 40(CX), AX, R8 + ADOXQ AX, R14 + ADCXQ R8, R13 + + // | j6 + + // | w15 @ R13 + MULXQ 48(CX), AX, R8 + ADOXQ AX, R13 + ADCXQ R8, R12 + + // | j7 + + // | w16 @ R12 + MULXQ 56(CX), AX, R8 + ADOXQ AX, R12 + ADCXQ R8, R11 + + // | j8 + + // | w17 @ R11 + MULXQ 64(CX), AX, R8 + ADOXQ AX, R11 + ADCXQ R8, R10 + + // | j9 + + // | w18 @ 8(SP) + MULXQ 72(CX), AX, R8 + ADOXQ AX, R10 + ADCXQ R8, R9 + + // | the last bit + MOVQ $0x00, R8 + ADOXQ R8, R9 + ADDQ 8(SP), R10 + ADCQ (SP), R9 + ADCQ $0x00, R8 + + // | + // | q3 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - + // | 10 BX | 11 SI | 12 DI | 13 R15 | 14 R14 | 15 R13 | 16 R12 | 17 R11 | 18 R10 | 19 R9 + + + // | + +/* modular reduction */ + + MOVQ BX, AX + SUBQ (CX), AX + MOVQ SI, DX + SBBQ 8(CX), DX + MOVQ DX, (SP) + MOVQ DI, DX + SBBQ 16(CX), DX + MOVQ DX, 8(SP) + MOVQ R15, DX + SBBQ 24(CX), DX + MOVQ DX, 16(SP) + MOVQ R14, DX + SBBQ 32(CX), DX + MOVQ DX, 96(SP) + MOVQ R13, DX + SBBQ 40(CX), DX + MOVQ DX, 104(SP) + MOVQ R12, DX + SBBQ 48(CX), DX + MOVQ DX, 112(SP) + MOVQ R11, DX + SBBQ 56(CX), DX + MOVQ DX, 120(SP) + MOVQ R10, DX + SBBQ 64(CX), DX + MOVQ DX, 128(SP) + MOVQ R9, DX + SBBQ 72(CX), DX + MOVQ DX, 136(SP) + SBBQ $0x00, R8 + + // | + +/* out */ + + MOVQ c+0(FP), R8 + CMOVQCC AX, BX + MOVQ BX, (R8) + CMOVQCC (SP), SI + MOVQ SI, 8(R8) + CMOVQCC 8(SP), DI + MOVQ DI, 16(R8) + CMOVQCC 16(SP), R15 + MOVQ R15, 24(R8) + CMOVQCC 96(SP), R14 + MOVQ R14, 32(R8) + CMOVQCC 104(SP), R13 + MOVQ R13, 40(R8) + CMOVQCC 112(SP), R12 + MOVQ R12, 48(R8) + CMOVQCC 120(SP), R11 + MOVQ R11, 56(R8) + CMOVQCC 128(SP), R10 + MOVQ R10, 64(R8) + CMOVQCC 136(SP), R9 + MOVQ R9, 72(R8) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_10(c *[10]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_10(SB), NOSPLIT, $168-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + 
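+ // | note: unlike the ADX variant above, this generic path accumulates
+ // | partial products with MULQ/ADDQ/ADCQ and a spare carry register,
+ // | so it does not rely on the MULX, ADOX and ADCX instructions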
+ // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ 
CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, 
R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, BX + + // | a9 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 R8 + // | 10 R9 | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 BX | 18 - | 19 - + + + MOVQ R8, 72(SP) + MOVQ R9, 80(SP) + MOVQ R10, 88(SP) + MOVQ R11, 96(SP) + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ BX, 136(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) + // | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 - | 19 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + MOVQ $0x00, BX + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX + MULQ CX + MOVQ AX, 144(SP) + MOVQ DX, R8 + + // | a0 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + + // | a1 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + MOVQ R8, 152(SP) + MOVQ $0x00, R8 + + // | a1 * b9 + 
MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + + // | a2 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + MOVQ R9, 160(SP) + MOVQ $0x00, R9 + + // | a2 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + + // | a3 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + + // | a3 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + + // | a4 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + + // | a4 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + + // | a6 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + + // | a6 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + + // | a7 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + + // | a7 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + + // | a8 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + + // | a9 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, BX + + // | a9 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 144(SP) | 9 152(SP) + // | 10 160(SP) | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) + // | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 - | 19 - + + + MOVQ 64(SP), AX + ADDQ AX, 144(SP) + MOVQ 72(SP), AX + ADCQ AX, 152(SP) + MOVQ 80(SP), AX + ADCQ AX, 160(SP) + ADCQ 88(SP), R10 + ADCQ 96(SP), R11 + ADCQ 104(SP), R12 + ADCQ 112(SP), R13 + ADCQ 120(SP), R14 + ADCQ 128(SP), R15 + ADCQ 136(SP), R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 144(SP) | 9 152(SP) + // | 10 160(SP) | 11 R10 | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R9, 8(SP) + MOVQ 32(SP), R9 + MOVQ R8, 16(SP) + MOVQ 40(SP), R8 + MOVQ R15, 24(SP) + MOVQ 48(SP), R15 + MOVQ R14, 32(SP) + MOVQ 56(SP), R14 + MOVQ R13, 40(SP) + MOVQ 144(SP), R13 + MOVQ R12, 48(SP) + MOVQ R11, 56(SP) + MOVQ R10, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R12 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 152(SP) + // | 10 160(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 
18 8(SP) | 19 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u0 + MOVQ R11, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R12), AX + MULQ R11 + ADDQ AX, CX + ADCQ DX, R10 + + // | j1 + + // | w1 @ DI + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w2 @ SI + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w3 @ BX + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w4 @ R9 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w5 @ R8 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w6 @ R15 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w7 @ R14 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + + // | w8 @ R13 + ADCQ DX, R13 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 152(SP) + // | 10 160(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u1 + MOVQ R11, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ DX, R10 + + // | j1 + + // | w2 @ SI + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w3 @ BX + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w4 @ R9 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w5 @ R8 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w6 @ R15 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w7 @ R14 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w8 @ R13 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R10, R13 + + // | move to idle register + MOVQ 152(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI + // | 10 160(SP) | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u2 + MOVQ R11, 88(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ DX, R10 + + // | j1 + + // | w3 @ BX + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w4 @ R9 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w5 @ R8 + MOVQ 
24(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w6 @ R15 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w7 @ R14 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w8 @ R13 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w9 @ DI + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R10, DI + + // | move to idle register + MOVQ 160(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI + // | 10 SI | 11 64(SP) | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u3 + MOVQ R11, 96(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ DX, R10 + + // | j1 + + // | w4 @ R9 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w5 @ R8 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w6 @ R15 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w7 @ R14 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w8 @ R13 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w9 @ DI + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w10 @ SI + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R10, SI + + // | move to idle register + MOVQ 64(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 56(SP) | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u4 = w4 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u4 + MOVQ R11, 64(SP) + + // | j0 + + // | w4 @ R9 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ DX, R10 + + // | j1 + + // | w5 @ R8 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w6 @ R15 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w7 @ R14 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w8 @ R13 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w9 @ DI + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w10 @ SI + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ 
DX, R10 + + // | j7 + + // | w11 @ BX + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R10, BX + + // | move to idle register + MOVQ 56(SP), R9 + + // | w12 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 48(SP) | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u5 = w5 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u5 + MOVQ R11, 56(SP) + + // | j0 + + // | w5 @ R8 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ DX, R10 + + // | j1 + + // | w6 @ R15 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w7 @ R14 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w8 @ R13 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w9 @ DI + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w10 @ SI + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w11 @ BX + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w12 @ R9 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R10, R9 + + // | move to idle register + MOVQ 48(SP), R8 + + // | w13 @ R8 + ADCQ CX, R8 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R15 | 7 R14 | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 40(SP) | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u6 = w6 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u6 + MOVQ R11, 48(SP) + + // | j0 + + // | w6 @ R15 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ DX, R10 + + // | j1 + + // | w7 @ R14 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w8 @ R13 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w9 @ DI + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w10 @ SI + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w11 @ BX + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w12 @ R9 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w13 @ R8 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ DX, CX + ADDQ R10, R8 + + // | move to idle register + MOVQ 40(SP), R15 + + // | w14 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R14 | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 32(SP) | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u7 = w7 * inp + MOVQ R14, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, 
R10 + + // | + +/* */ + + // | save u7 + MOVQ R11, 40(SP) + + // | j0 + + // | w7 @ R14 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ DX, R10 + + // | j1 + + // | w8 @ R13 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w9 @ DI + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w10 @ SI + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w11 @ BX + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w12 @ R9 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w13 @ R8 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w14 @ R15 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R10, R15 + + // | move to idle register + MOVQ 32(SP), R14 + + // | w15 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 32(SP) + + // | + +/* montgomerry reduction q2 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w8 @ R13 + MOVQ 64(R12), AX + MULQ 72(SP) + ADDQ AX, R13 + ADCQ DX, R10 + + // | j9 + + // | w9 @ DI + MOVQ 72(R12), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + + // | w10 @ SI + ADCQ DX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w9 @ DI + MOVQ 64(R12), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R10 + + // | j9 + + // | w10 @ SI + MOVQ 72(R12), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R10, SI + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w10 @ SI + MOVQ 64(R12), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R10 + + // | j9 + + // | w11 @ BX + MOVQ 72(R12), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R10, BX + + // | w12 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w11 @ BX + MOVQ 64(R12), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R10 + + // | j9 + + // | w12 @ R9 + MOVQ 72(R12), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R10, R9 + + // | w13 @ R8 + ADCQ CX, R8 + MOVQ 
$0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w12 @ R9 + MOVQ 64(R12), AX + MULQ 64(SP) + ADDQ AX, R9 + ADCQ DX, R10 + + // | j9 + + // | w13 @ R8 + MOVQ 72(R12), AX + MULQ 64(SP) + ADDQ AX, R8 + ADCQ DX, CX + ADDQ R10, R8 + + // | w14 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w13 @ R8 + MOVQ 64(R12), AX + MULQ 56(SP) + ADDQ AX, R8 + ADCQ DX, R10 + + // | j9 + + // | w14 @ R15 + MOVQ 72(R12), AX + MULQ 56(SP) + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R10, R15 + + // | w15 @ R14 + ADCQ CX, R14 + + // | bring the carry from q1 + MOVQ 32(SP), CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 24(SP) | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w14 @ R15 + MOVQ 64(R12), AX + MULQ 48(SP) + ADDQ AX, R15 + ADCQ DX, R10 + + // | j9 + + // | w15 @ R14 + MOVQ 72(R12), AX + MULQ 48(SP) + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R10, R14 + + // | move to an idle register + MOVQ 24(SP), R11 + + // | w16 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w15 @ R14 + MOVQ 64(R12), AX + MULQ 40(SP) + ADDQ AX, R14 + ADCQ DX, R10 + + // | j9 + + // | w16 @ R11 + MOVQ 72(R12), AX + MULQ 40(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R10, R11 + + // | tolarete this limb to stay in stack + // | w17 @ 16(SP) + ADCQ CX, 16(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | save the carry from q2 + // | should be added to w18 + MOVQ CX, 32(SP) + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u8 = w8 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, CX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u8 + MOVQ CX, 40(SP) + + // | j0 + + // | w8 @ R13 + MOVQ (R12), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R10 + + // | j1 + + // | w9 @ DI + MOVQ 8(R12), AX + MULQ CX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w10 @ SI + MOVQ 16(R12), AX + MULQ CX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w11 @ BX + MOVQ 24(R12), AX + MULQ CX + ADDQ AX, BX + ADCQ $0x00, DX + 
ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w12 @ R9 + MOVQ 32(R12), AX + MULQ CX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w13 @ R8 + MOVQ 40(R12), AX + MULQ CX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w14 @ R15 + MOVQ 48(R12), AX + MULQ CX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w15 @ R14 + MOVQ 56(R12), AX + MULQ CX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + + // | w16 @ R11 + ADCQ DX, R11 + ADCQ $0x00, R13 + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 DI + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 16(SP) | 18 8(SP) | 19 (SP) + + + // | | u9 = w9 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, CX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u9 + MOVQ CX, 48(SP) + + // | j0 + + // | w9 @ DI + MOVQ (R12), AX + MULQ CX + ADDQ AX, DI + ADCQ DX, R10 + + // | j1 + + // | w10 @ SI + MOVQ 8(R12), AX + MULQ CX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w11 @ BX + MOVQ 16(R12), AX + MULQ CX + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w12 @ R9 + MOVQ 24(R12), AX + MULQ CX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w13 @ R8 + MOVQ 32(R12), AX + MULQ CX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w14 @ R15 + MOVQ 40(R12), AX + MULQ CX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w15 @ R14 + MOVQ 48(R12), AX + MULQ CX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w16 @ R11 + MOVQ 56(R12), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R13 + ADDQ R10, R11 + + // | move to idle register + MOVQ 16(SP), DI + + // | w17 @ DI + ADCQ R13, DI + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 DI | 18 8(SP) | 19 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w18 + ADCQ R13, 32(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 DI | 18 8(SP) | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w16 @ R11 + MOVQ 64(R12), AX + MULQ 40(SP) + ADDQ AX, R11 + ADCQ DX, R10 + + // | j9 + + // | w17 @ DI + MOVQ 72(R12), AX + MULQ 40(SP) + ADDQ AX, DI + ADCQ 32(SP), DX + ADDQ R10, DI + MOVQ 8(SP), CX + + // | w18 @ CX + ADCQ DX, CX + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 DI | 18 CX | 19 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w17 @ DI + MOVQ 64(R12), AX + MULQ 48(SP) + ADDQ AX, DI + ADCQ DX, R10 + + // | j9 + + // | w18 @ CX + MOVQ 72(R12), AX + MULQ 48(SP) + ADDQ AX, CX + ADCQ DX, R13 + ADDQ R10, CX + + // | very last limb goes to short carry register + MOVQ (SP), R10 + + // | w-1 @ R10 + ADCQ R13, R10 + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + // | W q4 
+ // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - + // | 10 SI | 11 BX | 12 R9 | 13 R8 | 14 R15 | 15 R14 | 16 R11 | 17 DI | 18 CX | 19 R10 + + + // | + +/* modular reduction */ + + MOVQ SI, DX + SUBQ (R12), DX + MOVQ DX, (SP) + MOVQ BX, DX + SBBQ 8(R12), DX + MOVQ DX, 16(SP) + MOVQ R9, DX + SBBQ 16(R12), DX + MOVQ DX, 56(SP) + MOVQ R8, DX + SBBQ 24(R12), DX + MOVQ DX, 64(SP) + MOVQ R15, DX + SBBQ 32(R12), DX + MOVQ DX, 72(SP) + MOVQ R14, DX + SBBQ 40(R12), DX + MOVQ DX, 80(SP) + MOVQ R11, DX + SBBQ 48(R12), DX + MOVQ DX, 88(SP) + MOVQ DI, DX + SBBQ 56(R12), DX + MOVQ DX, 96(SP) + MOVQ CX, DX + SBBQ 64(R12), DX + MOVQ DX, 104(SP) + MOVQ R10, DX + SBBQ 72(R12), DX + MOVQ DX, 112(SP) + SBBQ $0x00, R13 + + // | + +/* out */ + + MOVQ c+0(FP), R13 + CMOVQCC (SP), SI + MOVQ SI, (R13) + CMOVQCC 16(SP), BX + MOVQ BX, 8(R13) + CMOVQCC 56(SP), R9 + MOVQ R9, 16(R13) + CMOVQCC 64(SP), R8 + MOVQ R8, 24(R13) + CMOVQCC 72(SP), R15 + MOVQ R15, 32(R13) + CMOVQCC 80(SP), R14 + MOVQ R14, 40(R13) + CMOVQCC 88(SP), R11 + MOVQ R11, 48(R13) + CMOVQCC 96(SP), DI + MOVQ DI, 56(R13) + CMOVQCC 104(SP), CX + MOVQ CX, 64(R13) + CMOVQCC 112(SP), R10 + MOVQ R10, 72(R13) + RET + + // | + +/* end */ + + +// func cpy11(dst *[11]uint64, src *[11]uint64) +TEXT ·cpy11(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + RET + +// func eq11(a *[11]uint64, b *[11]uint64) bool +TEXT ·eq11(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp11(a *[11]uint64, b *[11]uint64) int8 +TEXT ·cmp11(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64) +TEXT ·add11(SB), NOSPLIT, $96-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), 
R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 8(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 16(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 24(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 32(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 40(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 48(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 56(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 64(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 72(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 80(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 88(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 8(SP), CX + MOVQ CX, (DI) + CMOVQCC 16(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 24(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 32(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 40(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 48(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 56(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 64(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 72(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 80(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 88(SP), BX + MOVQ BX, 80(DI) + RET + + // | + +/* end */ + + RET + +// func addn11(a *[11]uint64, b *[11]uint64) uint64 +TEXT ·addn11(SB), NOSPLIT, $8-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double11(c *[11]uint64, a *[11]uint64, p *[11]uint64) +TEXT ·double11(SB), NOSPLIT, $96-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 8(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 16(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 24(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 32(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 40(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 48(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 56(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 64(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 72(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 80(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 88(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 8(SP), CX + MOVQ CX, (DI) + CMOVQCC 16(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 24(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 
32(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 40(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 48(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 56(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 64(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 72(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 80(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 88(SP), BX + MOVQ BX, 80(DI) + RET + + // | + +/* end */ + + RET + +// func sub11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64) +TEXT ·sub11(SB), NOSPLIT, $96-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 8(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 16(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 24(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 80(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 88(SP) + + // | + MOVQ c+0(FP), DI + ADDQ 8(SP), CX + MOVQ CX, (DI) + ADCQ 16(SP), DX + MOVQ DX, 8(DI) + ADCQ 24(SP), R8 + MOVQ R8, 16(DI) + ADCQ 32(SP), R9 + MOVQ R9, 24(DI) + ADCQ 40(SP), R10 + MOVQ R10, 32(DI) + ADCQ 48(SP), R11 + MOVQ R11, 40(DI) + ADCQ 56(SP), R12 + MOVQ R12, 48(DI) + ADCQ 64(SP), R13 + MOVQ R13, 56(DI) + ADCQ 72(SP), R14 + MOVQ R14, 64(DI) + ADCQ 80(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 88(SP), BX + MOVQ BX, 80(DI) + RET + + // | + +/* end */ + + RET + +// func subn11(a *[11]uint64, b *[11]uint64) uint64 +TEXT ·subn11(SB), NOSPLIT, $8-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg11(c *[11]uint64, a *[11]uint64, p *[11]uint64) +TEXT ·_neg11(SB), NOSPLIT, $8-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + 
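+ // | the 11th limb was spilled to stack earlier; reload and store it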
MOVQ (SP), BX + MOVQ BX, 80(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_11(a *[11]uint64) +TEXT ·mul_two_11(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RET + +// func div_two_11(a *[11]uint64) +TEXT ·div_two_11(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64, inp uint64) +TEXT ·mul11(SB), NOSPLIT, $184-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, 
BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ 
AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX + + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX + + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ BX, CX + ADCQ $0x00, CX + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) + // | 11 R8 | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 - | 21 - + + + MOVQ R8, 88(SP) + MOVQ R9, 96(SP) + MOVQ R10, 104(SP) + MOVQ R11, 112(SP) + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) + // | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 - | 21 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 160(SP) + + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX + ADCQ $0x00, R8 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R9, R9 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 168(SP) + + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R10, R10 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 176(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 3 */ + + // | 
a3 @ DX + MOVQ 24(DI), DX + XORQ R11, R11 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R12, R12 + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R13, R13 + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R14, R14 + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R15, R15 + + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ CX, CX + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R8, R8 + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ DI, DI + + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) + // | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 - | 21 - + + + MOVQ 72(SP), AX + ADDQ AX, 160(SP) + MOVQ 80(SP), AX + ADCQ AX, 168(SP) + MOVQ 88(SP), AX + ADCQ AX, 176(SP) + ADCQ 96(SP), R9 + ADCQ 104(SP), R10 + ADCQ 112(SP), R11 + ADCQ 120(SP), R12 + ADCQ 128(SP), R13 + ADCQ 136(SP), R14 + ADCQ 144(SP), R15 + ADCQ 152(SP), CX + ADCQ $0x00, R8 + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R8, 8(SP) + MOVQ 24(SP), R8 + MOVQ CX, 16(SP) + MOVQ 32(SP), CX + MOVQ R15, 24(SP) + MOVQ 40(SP), R15 + MOVQ R14, 32(SP) + MOVQ 48(SP), R14 + MOVQ R13, 40(SP) + MOVQ 56(SP), R13 + MOVQ R12, 48(SP) + MOVQ 64(SP), R12 + MOVQ R11, 56(SP) + MOVQ 160(SP), R11 + MOVQ R10, 64(SP) + MOVQ R9, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), R9 + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R8 | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 168(SP) + // | 11 176(SP) | 12 72(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 
18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R8 | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 168(SP) + // | 11 176(SP) | 12 72(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R10 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j3 + + // | w3 @ R8 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j4 + + // | w4 @ CX + MULXQ 32(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j5 + + // | w5 @ R15 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j6 + + // | w6 @ R14 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j7 + + // | w7 @ R13 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j8 + + // | w8 @ R12 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + ADOXQ BX, R11 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R8 | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 168(SP) + // | 11 176(SP) | 12 72(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R10 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j2 + + // | w3 @ R8 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j3 + + // | w4 @ CX + MULXQ 24(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j4 + + // | w5 @ R15 + MULXQ 32(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j5 + + // | w6 @ R14 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j6 + + // | w7 @ R13 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j7 + + // | w8 @ R12 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j8 + + // | w9 @ R11 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R11 + + // | w10 @ 168(SP) + // | move to temp register + MOVQ 168(SP), AX + ADCXQ R10, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R8 | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 176(SP) | 12 72(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R10 + + // | save u2 + MOVQ DX, 96(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j1 + + // | w3 @ R8 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j2 + + // | w4 @ CX + MULXQ 16(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j3 + + // | w5 @ R15 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j4 + + // | w6 @ R14 + MULXQ 32(R9), AX, R10 + ADOXQ 
AX, R14 + ADCXQ R10, R13 + + // | j5 + + // | w7 @ R13 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j6 + + // | w8 @ R12 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j7 + + // | w9 @ R11 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(R9), AX, R10 + ADOXQ AX, BX + + // | w11 @ 176(SP) + // | move to temp register + MOVQ 176(SP), AX + ADCXQ R10, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R8 | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 72(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u3 = w3 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, R10 + + // | save u3 + MOVQ DX, 104(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ R8 + MULXQ (R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j1 + + // | w4 @ CX + MULXQ 8(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j2 + + // | w5 @ R15 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j3 + + // | w6 @ R14 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j4 + + // | w7 @ R13 + MULXQ 32(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j5 + + // | w8 @ R12 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j6 + + // | w9 @ R11 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(R9), AX, R10 + ADOXQ AX, SI + + // | w12 @ 72(SP) + // | move to temp register + MOVQ 72(SP), AX + ADCXQ R10, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 CX | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u4 = w4 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R10 + + // | save u4 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w4 @ CX + MULXQ (R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j1 + + // | w5 @ R15 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j2 + + // | w6 @ R14 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j3 + + // | w7 @ R13 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j4 + + // | w8 @ R12 + MULXQ 32(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j5 + + // | w9 @ R11 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(R9), AX, R10 + ADOXQ AX, DI + + // | w13 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R10, AX + ADOXQ R8, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R8 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R15 | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 
32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u5 = w5 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R10 + + // | save u5 + MOVQ DX, 64(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R15 + MULXQ (R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j1 + + // | w6 @ R14 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j2 + + // | w7 @ R13 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j3 + + // | w8 @ R12 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j4 + + // | w9 @ R11 + MULXQ 32(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j8 + + // | w13 @ R8 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R8 + + // | w14 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R10, AX + ADOXQ CX, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, CX + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R14 | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u6 = w6 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R10 + + // | save u6 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ R14 + MULXQ (R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j1 + + // | w7 @ R13 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j2 + + // | w8 @ R12 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j3 + + // | w9 @ R11 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j7 + + // | w13 @ R8 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j8 + + // | w14 @ CX + MULXQ 64(R9), AX, R10 + ADOXQ AX, CX + + // | w15 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R10, AX + ADOXQ R15, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R13 | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u7 = w7 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R10 + + // | save u7 + MOVQ DX, 48(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R13 + MULXQ (R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, R12 + + // | j1 + + // | w8 @ R12 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j2 + + // | w9 @ R11 + MULXQ 16(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j6 + + // | w13 @ R8 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j7 + + // | w14 @ CX + MULXQ 56(R9), AX, 
R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j8 + + // | w15 @ R15 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R15 + + // | w16 @ 40(SP) + // | move to temp register + MOVQ 40(SP), AX + ADCXQ R10, AX + ADOXQ R14, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R12 | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u8 = w8 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R10 + + // | save u8 + MOVQ DX, 40(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ R12 + MULXQ (R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, R11 + + // | j1 + + // | w9 @ R11 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j5 + + // | w13 @ R8 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j6 + + // | w14 @ CX + MULXQ 48(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j7 + + // | w15 @ R15 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j8 + + // | w16 @ R14 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R14 + + // | w17 @ 32(SP) + // | move to temp register + MOVQ 32(SP), AX + ADCXQ R10, AX + ADOXQ R13, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R13 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | carry from q1 should be added to w18 + MOVQ R12, 32(SP) + + // | + +/* montgomerry reduction q2 */ + + // | clear flags + XORQ R12, R12 + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w9 @ R11 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j10 + + // | w10 @ BX + MULXQ 80(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, SI + ADOXQ R12, SI + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 BX + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u1 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w10 @ BX + MULXQ 72(R9), AX, R10 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R10, SI + + // | j10 + + // | w11 @ SI + MULXQ 80(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + ADOXQ R12, DI + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u2 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w11 @ SI + 
MULXQ 72(R9), AX, R10 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R10, DI + + // | j10 + + // | w12 @ DI + MULXQ 80(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + ADOXQ R12, R8 + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u3 @ 104(SP) + MOVQ 104(SP), DX + + // | + +/* */ + + // | j9 + + // | w12 @ DI + MULXQ 72(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j10 + + // | w13 @ R8 + MULXQ 80(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + ADOXQ R12, CX + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u4 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w13 @ R8 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j10 + + // | w14 @ CX + MULXQ 80(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + ADOXQ R12, R15 + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u5 @ 64(SP) + MOVQ 64(SP), DX + + // | + +/* */ + + // | j9 + + // | w14 @ CX + MULXQ 72(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j10 + + // | w15 @ R15 + MULXQ 80(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + ADOXQ R12, R14 + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u6 @ 56(SP) + MOVQ 56(SP), DX + + // | + +/* */ + + // | j9 + + // | w15 @ R15 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j10 + + // | w16 @ R14 + MULXQ 80(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + ADOXQ R12, R13 + + // | bring the carry from q1 + MOVQ 32(SP), R12 + MOVQ $0x00, AX + ADCXQ AX, R12 + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | u7 @ 48(SP) + MOVQ 48(SP), DX + + // | + +/* */ + + // | j9 + + // | w16 @ R14 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j10 + + // | w17 @ R13 + MULXQ 80(R9), AX, R10 + ADOXQ AX, R13 + + // | w18 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), BX + + // | w18 @ BX + ADCXQ R10, BX + ADOXQ R12, BX + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 16(SP) | 20 8(SP) 
| 21 (SP) + + + // | u8 @ 40(SP) + MOVQ 40(SP), DX + + // | + +/* */ + + // | j9 + + // | w17 @ R13 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, BX + + // | j10 + + // | w18 @ BX + MULXQ 80(R9), AX, R10 + ADOXQ AX, BX + + // | w19 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), SI + + // | w19 @ SI + ADCXQ R10, SI + ADOXQ R12, SI + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 80(SP) + // | 11 88(SP) | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 SI | 20 8(SP) | 21 (SP) + + + // | save the carry from q2 + // | should be added to w20 + MOVQ R12, 32(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 80(SP), R12 + MOVQ SI, 16(SP) + MOVQ 88(SP), SI + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 R12 + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R11 | 10 R12 + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u9 = w9 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R10 + + // | save u9 + MOVQ DX, 24(SP) + + // | + +/* */ + + // | j0 + + // | w9 @ R11 + MULXQ (R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, R12 + + // | j1 + + // | w10 @ R12 + MULXQ 8(R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, SI + + // | j2 + + // | w11 @ SI + MULXQ 16(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j3 + + // | w12 @ DI + MULXQ 24(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j4 + + // | w13 @ R8 + MULXQ 32(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j5 + + // | w14 @ CX + MULXQ 40(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j6 + + // | w15 @ R15 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j7 + + // | w16 @ R14 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j8 + + // | w17 @ R13 + MULXQ 64(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, BX + ADOXQ R11, BX + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R12 + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u10 = w10 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R10 + + // | save u10 + MOVQ DX, 40(SP) + + // | + +/* */ + + // | j0 + + // | w10 @ R12 + MULXQ (R9), AX, R10 + ADOXQ AX, R12 + ADCXQ R10, SI + + // | j1 + + // | w11 @ SI + MULXQ 8(R9), AX, R10 + ADOXQ AX, SI + ADCXQ R10, DI + + // | j2 + + // | w12 @ DI + MULXQ 16(R9), AX, R10 + ADOXQ AX, DI + ADCXQ R10, R8 + + // | j3 + + // | w13 @ R8 + MULXQ 24(R9), AX, R10 + ADOXQ AX, R8 + ADCXQ R10, CX + + // | j4 + + // | w14 @ CX + MULXQ 32(R9), AX, R10 + ADOXQ AX, CX + ADCXQ R10, R15 + + // | j5 + + // | w15 @ R15 + MULXQ 40(R9), AX, R10 + ADOXQ AX, R15 + ADCXQ R10, R14 + + // | j6 + + // | w16 @ R14 + MULXQ 48(R9), AX, R10 + ADOXQ AX, R14 + ADCXQ R10, R13 + + // | j7 + + // | w17 @ R13 + MULXQ 56(R9), AX, R10 + ADOXQ AX, R13 + ADCXQ R10, BX + + // | j8 + + // | w18 @ BX + MULXQ 64(R9), AX, R10 + ADOXQ AX, BX + + // | w19 @ 16(SP) + // | move to temp register + MOVQ 16(SP), AX + ADCXQ R10, AX + ADOXQ R11, AX + + // | move to an idle 
register + // | w19 @ AX + MOVQ AX, R11 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 R11 | 20 8(SP) | 21 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w20 + ADCQ 32(SP), R12 + + // | + +/* montgomerry reduction q4 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 BX | 19 R11 | 20 8(SP) | 21 (SP) + + + // | u0 @ 24(SP) + MOVQ 24(SP), DX + + // | + +/* */ + + // | j9 + + // | w18 @ BX + MULXQ 72(R9), AX, R10 + ADOXQ AX, BX + ADCXQ R10, R11 + MOVQ BX, 16(SP) + + // | j10 + + // | w19 @ R11 + MULXQ 80(R9), AX, R10 + ADOXQ AX, R11 + + // | w20 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), BX + ADCXQ R10, BX + + // | bring carry from q2 & q3 + // | w20 @ BX + ADOXQ R12, BX + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, R10 + ADOXQ R10, R12 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 16(SP) | 19 R11 | 20 BX | 21 (SP) + + + // | u1 @ 40(SP) + MOVQ 40(SP), DX + + // | + +/* */ + + // | j9 + + // | w19 @ R11 + MULXQ 72(R9), AX, R10 + ADOXQ AX, R11 + ADCXQ R10, BX + + // | j10 + + // | w20 @ BX + MULXQ 80(R9), AX, R10 + ADOXQ AX, BX + + // | w21 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R10, AX + + // | w21 @ AX + ADOXQ R12, AX + MOVQ $0x00, R12 + ADCXQ R12, R12 + MOVQ $0x00, R10 + ADOXQ R10, R12 + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 SI | 12 DI | 13 R8 | 14 CX | 15 R15 | 16 R14 | 17 R13 | 18 16(SP) | 19 R11 | 20 BX | 21 AX + + + // | + +/* modular reduction */ + + MOVQ SI, R10 + SUBQ (R9), R10 + MOVQ DI, DX + SBBQ 8(R9), DX + MOVQ DX, (SP) + MOVQ R8, DX + SBBQ 16(R9), DX + MOVQ DX, 8(SP) + MOVQ CX, DX + SBBQ 24(R9), DX + MOVQ DX, 24(SP) + MOVQ R15, DX + SBBQ 32(R9), DX + MOVQ DX, 32(SP) + MOVQ R14, DX + SBBQ 40(R9), DX + MOVQ DX, 40(SP) + MOVQ R13, DX + SBBQ 48(R9), DX + MOVQ DX, 48(SP) + MOVQ 16(SP), DX + SBBQ 56(R9), DX + MOVQ DX, 56(SP) + MOVQ R11, DX + SBBQ 64(R9), DX + MOVQ DX, 64(SP) + MOVQ BX, DX + SBBQ 72(R9), DX + MOVQ DX, 72(SP) + MOVQ AX, DX + SBBQ 80(R9), DX + MOVQ DX, 80(SP) + SBBQ $0x00, R12 + + // | + +/* out */ + + MOVQ c+0(FP), R12 + CMOVQCC R10, SI + MOVQ SI, (R12) + CMOVQCC (SP), DI + MOVQ DI, 8(R12) + CMOVQCC 8(SP), R8 + MOVQ R8, 16(R12) + CMOVQCC 24(SP), CX + MOVQ CX, 24(R12) + CMOVQCC 32(SP), R15 + MOVQ R15, 32(R12) + CMOVQCC 40(SP), R14 + MOVQ R14, 40(R12) + CMOVQCC 48(SP), R13 + MOVQ R13, 48(R12) + MOVQ 16(SP), DX + CMOVQCC 56(SP), DX + MOVQ DX, 56(R12) + CMOVQCC 64(SP), R11 + MOVQ R11, 64(R12) + CMOVQCC 72(SP), BX + MOVQ BX, 72(R12) + CMOVQCC 80(SP), AX + MOVQ AX, 80(R12) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_11(c *[11]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_11(SB), NOSPLIT, $192-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + 
MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + 
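+	// | in this no-ADX/BMI2 variant each column is accumulated with plain
+	// | MULQ/ADDQ/ADCQ; BX catches the carry that overflows a column and
+	// | is folded into the next one before being cleared again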
MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), 
AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 + + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a9 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + + // | a10 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, BX + + // | a10 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 R9 + // | 11 R10 | 12 R11 
| 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 BX | 19 - | 20 - | 21 - + + + MOVQ R9, 80(SP) + MOVQ R10, 88(SP) + MOVQ R11, 96(SP) + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ BX, 144(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) + // | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 - | 20 - | 21 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX + MULQ CX + MOVQ AX, 152(SP) + MOVQ DX, R8 + + // | a0 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 160(SP) + MOVQ $0x00, R8 + + // | a1 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a1 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 168(SP) + MOVQ $0x00, R9 + + // | a2 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a2 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 176(SP) + MOVQ $0x00, R10 + + // | a3 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a3 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 184(SP) + MOVQ $0x00, R11 + + // | a4 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a4 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a5 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + + // | a6 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a6 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + + // | a7 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a7 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 
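+	// | rows i = 8..10 below finish the b8..b10 columns; the two halves
+	// | of W are then summed limb-wise before the Montgomery reduction steps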
+ + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + + // | a8 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a8 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + + // | a9 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a9 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + + // | a10 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, BX + + // | a10 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 152(SP) | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 184(SP) | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) + // | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 - | 20 - | 21 - + + + MOVQ 64(SP), AX + ADDQ AX, 152(SP) + MOVQ 72(SP), AX + ADCQ AX, 160(SP) + MOVQ 80(SP), AX + ADCQ AX, 168(SP) + MOVQ 88(SP), AX + ADCQ AX, 176(SP) + MOVQ 96(SP), AX + ADCQ AX, 184(SP) + ADCQ 104(SP), R12 + ADCQ 112(SP), R13 + ADCQ 120(SP), R14 + ADCQ 128(SP), R15 + ADCQ 136(SP), R8 + ADCQ 144(SP), R9 + ADCQ $0x00, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 152(SP) | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 184(SP) | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R11, 8(SP) + MOVQ 32(SP), R11 + MOVQ R10, 16(SP) + MOVQ 40(SP), R10 + MOVQ R9, 24(SP) + MOVQ 48(SP), R9 + MOVQ R8, 32(SP) + MOVQ 56(SP), R8 + MOVQ R15, 40(SP) + MOVQ 152(SP), R15 + MOVQ R14, 48(SP) + MOVQ R13, 56(SP) + MOVQ R12, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R14 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 184(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u0 + MOVQ R13, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R14), AX + MULQ R13 + ADDQ AX, CX + ADCQ DX, R12 + + // | j1 + + // | w1 @ DI + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w2 @ SI + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w3 @ BX + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + 
// | j4 + + // | w4 @ R11 + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w5 @ R10 + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w6 @ R9 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w7 @ R8 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + + // | w8 @ R15 + ADCQ DX, R15 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 160(SP) | 10 168(SP) + // | 11 176(SP) | 12 184(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u1 + MOVQ R13, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ DX, R12 + + // | j1 + + // | w2 @ SI + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w3 @ BX + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w4 @ R11 + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w5 @ R10 + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w6 @ R9 + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w7 @ R8 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w8 @ R15 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R12, R15 + + // | move to idle register + MOVQ 160(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 168(SP) + // | 11 176(SP) | 12 184(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u2 + MOVQ R13, 88(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ DX, R12 + + // | j1 + + // | w3 @ BX + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w4 @ R11 + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w5 @ R10 + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w6 @ R9 + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w7 @ R8 + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w8 @ R15 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w9 @ DI + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ DX, CX + 
ADDQ R12, DI + + // | move to idle register + MOVQ 168(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI + // | 11 176(SP) | 12 184(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u3 + MOVQ R13, 96(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ DX, R12 + + // | j1 + + // | w4 @ R11 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w5 @ R10 + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w6 @ R9 + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w7 @ R8 + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w8 @ R15 + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w9 @ DI + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w10 @ SI + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R12, SI + + // | move to idle register + MOVQ 176(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 184(SP) | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u4 = w4 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u4 + MOVQ R13, 104(SP) + + // | j0 + + // | w4 @ R11 + MOVQ (R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ DX, R12 + + // | j1 + + // | w5 @ R10 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w6 @ R9 + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w7 @ R8 + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w8 @ R15 + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w9 @ DI + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w10 @ SI + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w11 @ BX + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R12, BX + + // | move to idle register + MOVQ 184(SP), R11 + + // | w12 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 64(SP) | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u5 = w5 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ 
$0x00, R12 + + // | + +/* */ + + // | save u5 + MOVQ R13, 112(SP) + + // | j0 + + // | w5 @ R10 + MOVQ (R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ DX, R12 + + // | j1 + + // | w6 @ R9 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w7 @ R8 + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w8 @ R15 + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w9 @ DI + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w10 @ SI + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w11 @ BX + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w12 @ R11 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R12, R11 + + // | move to idle register + MOVQ 64(SP), R10 + + // | w13 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 56(SP) | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u6 = w6 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u6 + MOVQ R13, 64(SP) + + // | j0 + + // | w6 @ R9 + MOVQ (R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ DX, R12 + + // | j1 + + // | w7 @ R8 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w8 @ R15 + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w9 @ DI + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w10 @ SI + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w11 @ BX + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w12 @ R11 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w13 @ R10 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ DX, CX + ADDQ R12, R10 + + // | move to idle register + MOVQ 56(SP), R9 + + // | w14 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R8 | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 48(SP) | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u7 = w7 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u7 + MOVQ R13, 56(SP) + + // | j0 + + // | w7 @ R8 + MOVQ (R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ DX, R12 + + // | j1 + + // | w8 @ R15 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w9 @ DI + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w10 @ SI + MOVQ 24(R14), AX + MULQ R13 + 
ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w11 @ BX + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w12 @ R11 + MOVQ 40(R14), AX + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w13 @ R10 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w14 @ R9 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R12, R9 + + // | move to idle register + MOVQ 48(SP), R8 + + // | w15 @ R8 + ADCQ CX, R8 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 48(SP) + + // | + +/* montgomerry reduction q2 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w8 @ R15 + MOVQ 64(R14), AX + MULQ 72(SP) + ADDQ AX, R15 + ADCQ DX, R12 + + // | j9 + + // | w9 @ DI + MOVQ 72(R14), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w10 @ SI + MOVQ 80(R14), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + + // | w11 @ BX + ADCQ DX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w9 @ DI + MOVQ 64(R14), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R12 + MOVQ DI, 72(SP) + + // | j9 + + // | w10 @ SI + MOVQ 72(R14), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w11 @ BX + MOVQ 80(R14), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R12, BX + + // | w12 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w10 @ SI + MOVQ 64(R14), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R12 + + // | j9 + + // | w11 @ BX + MOVQ 72(R14), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w12 @ R11 + MOVQ 80(R14), AX + MULQ 88(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R12, R11 + + // | w13 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w11 @ BX + MOVQ 64(R14), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R12 + + // | j9 + + // | w12 @ R11 + MOVQ 72(R14), AX + 
MULQ 96(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w13 @ R10 + MOVQ 80(R14), AX + MULQ 96(SP) + ADDQ AX, R10 + ADCQ DX, CX + ADDQ R12, R10 + + // | w14 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w12 @ R11 + MOVQ 64(R14), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ DX, R12 + + // | j9 + + // | w13 @ R10 + MOVQ 72(R14), AX + MULQ 104(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w14 @ R9 + MOVQ 80(R14), AX + MULQ 104(SP) + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R12, R9 + + // | w15 @ R8 + ADCQ CX, R8 + + // | bring the carry from q1 + MOVQ 48(SP), CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 40(SP) | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w13 @ R10 + MOVQ 64(R14), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ DX, R12 + + // | j9 + + // | w14 @ R9 + MOVQ 72(R14), AX + MULQ 112(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w15 @ R8 + MOVQ 80(R14), AX + MULQ 112(SP) + ADDQ AX, R8 + ADCQ DX, CX + ADDQ R12, R8 + + // | move to an idle register + MOVQ 40(SP), R13 + + // | w16 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 32(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w14 @ R9 + MOVQ 64(R14), AX + MULQ 64(SP) + ADDQ AX, R9 + ADCQ DX, R12 + + // | j9 + + // | w15 @ R8 + MOVQ 72(R14), AX + MULQ 64(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w16 @ R13 + MOVQ 80(R14), AX + MULQ 64(SP) + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R12, R13 + + // | move to an idle register + MOVQ 32(SP), DI + + // | w17 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 DI | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w15 @ R8 + MOVQ 64(R14), AX + MULQ 56(SP) + ADDQ AX, R8 + ADCQ DX, R12 + + // | j9 + + // | w16 @ R13 + MOVQ 72(R14), AX + MULQ 56(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w17 @ DI + MOVQ 80(R14), AX + MULQ 56(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R12, DI + + // | tolarete this limb to stay in stack + // | w18 @ 24(SP) + ADCQ CX, 24(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 DI | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | save the carry from q2 + // | should be added to w19 + MOVQ CX, 48(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 
72(SP), CX + MOVQ DI, 72(SP) + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 CX | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 72(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 CX | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 72(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u8 = w8 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u8 + MOVQ DI, 56(SP) + + // | j0 + + // | w8 @ R15 + MOVQ (R14), AX + MULQ DI + ADDQ AX, R15 + ADCQ DX, R12 + + // | j1 + + // | w9 @ CX + MOVQ 8(R14), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w10 @ SI + MOVQ 16(R14), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w11 @ BX + MOVQ 24(R14), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w12 @ R11 + MOVQ 32(R14), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w13 @ R10 + MOVQ 40(R14), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w14 @ R9 + MOVQ 48(R14), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w15 @ R8 + MOVQ 56(R14), AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + + // | w16 @ R13 + ADCQ DX, R13 + ADCQ $0x00, R15 + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 CX | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 72(SP) | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u9 = w9 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u9 + MOVQ DI, 64(SP) + + // | j0 + + // | w9 @ CX + MOVQ (R14), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R12 + + // | j1 + + // | w10 @ SI + MOVQ 8(R14), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w11 @ BX + MOVQ 16(R14), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w12 @ R11 + MOVQ 24(R14), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w13 @ R10 + MOVQ 32(R14), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w14 @ R9 + MOVQ 40(R14), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w15 @ R8 + MOVQ 48(R14), AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w16 @ R13 + MOVQ 56(R14), AX + MULQ DI + ADDQ AX, R13 + ADCQ DX, R15 + ADDQ R12, R13 + + // | move to idle register + MOVQ 72(SP), CX + + // | w17 @ CX + ADCQ R15, CX + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 SI + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 CX | 18 24(SP) | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | | u10 = w10 * inp + MOVQ SI, AX + MULQ inp+32(FP) + 
MOVQ AX, DI + MOVQ $0x00, R12 + + // | + +/* */ + + // | save u10 + MOVQ DI, 72(SP) + + // | j0 + + // | w10 @ SI + MOVQ (R14), AX + MULQ DI + ADDQ AX, SI + ADCQ DX, R12 + + // | j1 + + // | w11 @ BX + MOVQ 8(R14), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w12 @ R11 + MOVQ 16(R14), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w13 @ R10 + MOVQ 24(R14), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w14 @ R9 + MOVQ 32(R14), AX + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 + + // | w15 @ R8 + MOVQ 40(R14), AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j6 + + // | w16 @ R13 + MOVQ 48(R14), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 + + // | w17 @ CX + MOVQ 56(R14), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R15 + ADDQ R12, CX + + // | move to idle register + MOVQ 24(SP), SI + + // | w18 @ SI + ADCQ R15, SI + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 CX | 18 SI | 19 16(SP) | 20 8(SP) | 21 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w19 + ADCQ R15, 48(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 CX | 18 SI | 19 16(SP) | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w16 @ R13 + MOVQ 64(R14), AX + MULQ 56(SP) + ADDQ AX, R13 + ADCQ DX, R12 + + // | j9 + + // | w17 @ CX + MOVQ 72(R14), AX + MULQ 56(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w18 @ SI + MOVQ 80(R14), AX + MULQ 56(SP) + ADDQ AX, SI + ADCQ 48(SP), DX + ADDQ R12, SI + MOVQ 16(SP), DI + + // | w19 @ DI + ADCQ DX, DI + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 CX | 18 SI | 19 DI | 20 8(SP) | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w17 @ CX + MOVQ 64(R14), AX + MULQ 64(SP) + ADDQ AX, CX + ADCQ DX, R12 + MOVQ CX, 24(SP) + + // | j9 + + // | w18 @ SI + MOVQ 72(R14), AX + MULQ 64(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w19 @ DI + MOVQ 80(R14), AX + MULQ 64(SP) + ADDQ AX, DI + ADCQ DX, R15 + ADDQ R12, DI + MOVQ 8(SP), CX + + // | w20 @ CX + ADCQ R15, CX + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 24(SP) | 18 SI | 19 DI | 20 CX | 21 (SP) + + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w18 @ SI + MOVQ 64(R14), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ DX, R12 + + // | j9 + + // | w19 @ DI + MOVQ 72(R14), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w20 @ CX + MOVQ 80(R14), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ DX, R15 + ADDQ R12, CX + + 
// | very last limb goes to short carry register + MOVQ (SP), R12 + + // | w-1 @ R12 + ADCQ R15, R12 + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - + // | 11 BX | 12 R11 | 13 R10 | 14 R9 | 15 R8 | 16 R13 | 17 24(SP) | 18 SI | 19 DI | 20 CX | 21 R12 + + + // | + +/* modular reduction */ + + MOVQ BX, DX + SUBQ (R14), DX + MOVQ DX, (SP) + MOVQ R11, DX + SBBQ 8(R14), DX + MOVQ DX, 8(SP) + MOVQ R10, DX + SBBQ 16(R14), DX + MOVQ DX, 80(SP) + MOVQ R9, DX + SBBQ 24(R14), DX + MOVQ DX, 88(SP) + MOVQ R8, DX + SBBQ 32(R14), DX + MOVQ DX, 96(SP) + MOVQ R13, DX + SBBQ 40(R14), DX + MOVQ DX, 104(SP) + MOVQ 24(SP), DX + SBBQ 48(R14), DX + MOVQ DX, 112(SP) + MOVQ SI, DX + SBBQ 56(R14), DX + MOVQ DX, 120(SP) + MOVQ DI, DX + SBBQ 64(R14), DX + MOVQ DX, 128(SP) + MOVQ CX, DX + SBBQ 72(R14), DX + MOVQ DX, 136(SP) + MOVQ R12, DX + SBBQ 80(R14), DX + MOVQ DX, 144(SP) + SBBQ $0x00, R15 + + // | + +/* out */ + + MOVQ c+0(FP), R15 + CMOVQCC (SP), BX + MOVQ BX, (R15) + CMOVQCC 8(SP), R11 + MOVQ R11, 8(R15) + CMOVQCC 80(SP), R10 + MOVQ R10, 16(R15) + CMOVQCC 88(SP), R9 + MOVQ R9, 24(R15) + CMOVQCC 96(SP), R8 + MOVQ R8, 32(R15) + CMOVQCC 104(SP), R13 + MOVQ R13, 40(R15) + MOVQ 24(SP), DX + CMOVQCC 112(SP), DX + MOVQ DX, 48(R15) + CMOVQCC 120(SP), SI + MOVQ SI, 56(R15) + CMOVQCC 128(SP), DI + MOVQ DI, 64(R15) + CMOVQCC 136(SP), CX + MOVQ CX, 72(R15) + CMOVQCC 144(SP), R12 + MOVQ R12, 80(R15) + RET + + // | + +/* end */ + + +// func cpy12(dst *[12]uint64, src *[12]uint64) +TEXT ·cpy12(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + MOVQ 88(SI), R8 + MOVQ R8, 88(DI) + RET + +// func eq12(a *[12]uint64, b *[12]uint64) bool +TEXT ·eq12(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JNE ret + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp12(a *[12]uint64, b *[12]uint64) int8 +TEXT ·cmp12(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JB gt + JA lt + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + 
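+	// | all twelve limbs compared equal; return 0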
JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64) +TEXT ·add12(SB), NOSPLIT, $112-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 16(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 24(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 32(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 40(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 48(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 56(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 64(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 72(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 80(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 88(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 96(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 104(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 16(SP), CX + MOVQ CX, (DI) + CMOVQCC 24(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 32(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 40(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 48(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 56(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 64(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 72(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 80(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 88(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 96(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 104(SP), BX + MOVQ BX, 88(DI) + RET + + // | + +/* end */ + + RET + +// func addn12(a *[12]uint64, b *[12]uint64) uint64 +TEXT ·addn12(SB), NOSPLIT, $16-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double12(c *[12]uint64, a *[12]uint64, p *[12]uint64) +TEXT ·double12(SB), NOSPLIT, $112-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ BX, BX + MOVQ BX, 8(SP) + ADCQ $0x00, AX + + // | + MOVQ 
p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 16(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 24(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 32(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 40(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 48(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 56(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 64(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 72(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 80(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 88(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 96(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 104(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 16(SP), CX + MOVQ CX, (DI) + CMOVQCC 24(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 32(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 40(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 48(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 56(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 64(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 72(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 80(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 88(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 96(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 104(SP), BX + MOVQ BX, 88(DI) + RET + + // | + +/* end */ + + RET + +// func sub12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64) +TEXT ·sub12(SB), NOSPLIT, $112-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 16(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 24(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 80(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 88(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 96(SP) + CMOVQCS 88(SI), AX + MOVQ AX, 104(SP) + + // | + MOVQ c+0(FP), DI + ADDQ 16(SP), CX + MOVQ CX, (DI) + ADCQ 24(SP), DX + MOVQ DX, 8(DI) + ADCQ 32(SP), R8 + MOVQ R8, 16(DI) + ADCQ 40(SP), R9 + MOVQ R9, 24(DI) + ADCQ 48(SP), R10 + MOVQ R10, 32(DI) + ADCQ 56(SP), R11 + MOVQ R11, 40(DI) + ADCQ 64(SP), R12 + MOVQ R12, 48(DI) + ADCQ 72(SP), R13 + MOVQ R13, 56(DI) + ADCQ 80(SP), R14 + MOVQ R14, 64(DI) + ADCQ 88(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 96(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + ADCQ 104(SP), BX + MOVQ BX, 88(DI) + RET + + // | + +/* end */ + + RET + +// func subn12(a *[12]uint64, b *[12]uint64) uint64 +TEXT ·subn12(SB), NOSPLIT, $16-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + ADCQ $0x00, AX + + // | + MOVQ 
CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg12(c *[12]uint64, a *[12]uint64, p *[12]uint64) +TEXT ·_neg12(SB), NOSPLIT, $16-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + MOVQ 88(SI), BX + SBBQ 88(DI), BX + MOVQ BX, 8(SP) + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_12(a *[12]uint64) +TEXT ·mul_two_12(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RCLQ $0x01, 88(DI) + RET + +// func div_two_12(a *[12]uint64) +TEXT ·div_two_12(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 88(DI) + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64, inp uint64) +TEXT ·mul12(SB), NOSPLIT, $208-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + 
MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // 
| a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX + + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX + + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ AX, AX + + // | a11 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 88(SP) + MOVQ $0x00, R8 + + // | a11 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | 
a11 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a11 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a11 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a11 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a11 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ BX, R8 + ADCQ $0x00, R8 + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) + // | 12 R9 | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 - | 22 - | 23 - + + + MOVQ R9, 96(SP) + MOVQ R10, 104(SP) + MOVQ R11, 112(SP) + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + MOVQ R8, 160(SP) + + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 - | 22 - | 23 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 168(SP) + + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b11 + MULXQ 88(SI), AX, R9 + ADCXQ AX, R8 + ADCQ $0x00, R9 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R10, R10 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 176(SP) + + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R11, R11 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 184(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R12, R12 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 192(SP) + + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R13, R13 + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 200(SP) + + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R14, R14 + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R15, R15 + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ CX, CX + 
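+	// | CX was cleared above to collect the top carry of this row (a7 * b11)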
+ // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R8, R8 + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R9, R9 + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ R10, R10 + + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ DI, DI + + // | a11 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a11 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 - | 22 - | 23 - + + + MOVQ 72(SP), AX + ADDQ AX, 168(SP) + MOVQ 80(SP), AX + ADCQ AX, 176(SP) + MOVQ 88(SP), AX + ADCQ AX, 184(SP) + MOVQ 96(SP), AX + ADCQ AX, 192(SP) + MOVQ 104(SP), AX + ADCQ AX, 200(SP) + ADCQ 112(SP), R11 + ADCQ 120(SP), R12 + ADCQ 128(SP), R13 + ADCQ 136(SP), R14 + ADCQ 144(SP), R15 + ADCQ 152(SP), CX + ADCQ 160(SP), R8 + ADCQ $0x00, R9 + ADCQ $0x00, R10 + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R10, 8(SP) + MOVQ 24(SP), R10 + MOVQ R9, 16(SP) + MOVQ 32(SP), R9 + MOVQ R8, 24(SP) + MOVQ 40(SP), R8 + MOVQ CX, 32(SP) + MOVQ 48(SP), CX + MOVQ R15, 40(SP) + MOVQ 56(SP), R15 + MOVQ R14, 48(SP) + MOVQ 64(SP), R14 + MOVQ R13, 56(SP) + MOVQ 168(SP), R13 + MOVQ R12, 64(SP) + MOVQ R11, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), R11 + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R10 | 4 R9 | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R10 | 4 R9 | 5 R8 | 6 CX | 7 
R15 | 8 R14 | 9 R13 | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R12 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j3 + + // | w3 @ R10 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j4 + + // | w4 @ R9 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j5 + + // | w5 @ R8 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j6 + + // | w6 @ CX + MULXQ 48(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j7 + + // | w7 @ R15 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j8 + + // | w8 @ R14 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + ADOXQ BX, R13 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R10 | 4 R9 | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R12 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j2 + + // | w3 @ R10 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j3 + + // | w4 @ R9 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j4 + + // | w5 @ R8 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j5 + + // | w6 @ CX + MULXQ 40(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j6 + + // | w7 @ R15 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j7 + + // | w8 @ R14 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j8 + + // | w9 @ R13 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R13 + + // | w10 @ 176(SP) + // | move to temp register + MOVQ 176(SP), AX + ADCXQ R12, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R10 | 4 R9 | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R12 + + // | save u2 + MOVQ DX, 96(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j1 + + // | w3 @ R10 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j2 + + // | w4 @ R9 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j3 + + // | w5 @ R8 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j4 + + // | w6 @ CX + MULXQ 32(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j5 + + // | w7 @ R15 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j6 + + // | 
w8 @ R14 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j7 + + // | w9 @ R13 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(R11), AX, R12 + ADOXQ AX, BX + + // | w11 @ 184(SP) + // | move to temp register + MOVQ 184(SP), AX + ADCXQ R12, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R10 | 4 R9 | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 192(SP) | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u3 = w3 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R12 + + // | save u3 + MOVQ DX, 104(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ R10 + MULXQ (R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j1 + + // | w4 @ R9 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j2 + + // | w5 @ R8 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j3 + + // | w6 @ CX + MULXQ 24(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j4 + + // | w7 @ R15 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j5 + + // | w8 @ R14 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j6 + + // | w9 @ R13 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(R11), AX, R12 + ADOXQ AX, SI + + // | w12 @ 192(SP) + // | move to temp register + MOVQ 192(SP), AX + ADCXQ R12, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R9 | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 200(SP) | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u4 = w4 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R12 + + // | save u4 + MOVQ DX, 112(SP) + + // | + +/* */ + + // | j0 + + // | w4 @ R9 + MULXQ (R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j1 + + // | w5 @ R8 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j2 + + // | w6 @ CX + MULXQ 16(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j3 + + // | w7 @ R15 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j4 + + // | w8 @ R14 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j5 + + // | w9 @ R13 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(R11), AX, R12 + ADOXQ AX, DI + + // | w13 @ 200(SP) + // | move to temp register + MOVQ 200(SP), AX + ADCXQ R12, AX + ADOXQ R10, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R10 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R8 | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 72(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) 
| 23 (SP) + + + // | | u5 = w5 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, R12 + + // | save u5 + MOVQ DX, 120(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R8 + MULXQ (R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j1 + + // | w6 @ CX + MULXQ 8(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j2 + + // | w7 @ R15 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j3 + + // | w8 @ R14 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j4 + + // | w9 @ R13 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j8 + + // | w13 @ R10 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R10 + + // | w14 @ 72(SP) + // | move to temp register + MOVQ 72(SP), AX + ADCXQ R12, AX + ADOXQ R9, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R9 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 CX | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u6 = w6 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R12 + + // | save u6 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ CX + MULXQ (R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j1 + + // | w7 @ R15 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j2 + + // | w8 @ R14 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j3 + + // | w9 @ R13 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j7 + + // | w13 @ R10 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j8 + + // | w14 @ R9 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R9 + + // | w15 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R12, AX + ADOXQ R8, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R8 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R15 | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u7 = w7 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R12 + + // | save u7 + MOVQ DX, 64(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R15 + MULXQ (R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, R14 + + // | j1 + + // | w8 @ R14 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j2 + + // | w9 @ R13 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j6 + + // | w13 @ R10 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j7 + + // | w14 @ 
R9 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j8 + + // | w15 @ R8 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R8 + + // | w16 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R12, AX + ADOXQ CX, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, CX + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R14 | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u8 = w8 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R12 + + // | save u8 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ R14 + MULXQ (R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, R13 + + // | j1 + + // | w9 @ R13 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j5 + + // | w13 @ R10 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j6 + + // | w14 @ R9 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j7 + + // | w15 @ R8 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j8 + + // | w16 @ CX + MULXQ 64(R11), AX, R12 + ADOXQ AX, CX + + // | w17 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R12, AX + ADOXQ R15, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | carry from q1 should be added to w18 + MOVQ R14, 48(SP) + + // | + +/* montgomerry reduction q2 */ + + // | clear flags + XORQ R14, R14 + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w9 @ R13 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, BX + + // | j10 + + // | w10 @ BX + MULXQ 80(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, SI + + // | j11 + + // | w11 @ SI + MULXQ 88(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + ADOXQ R14, DI + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 BX | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u1 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w10 @ BX + MULXQ 72(R11), AX, R12 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R12, SI + + // | j10 + + // | w11 @ SI + MULXQ 80(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j11 + + // | w12 @ DI + MULXQ 88(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + ADOXQ R14, R10 + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + 
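+ // | note: in this q2 pass each u_i saved during q1 (i = 0..8) is
+ // | multiplied by the three high modulus limbs 72(R11), 80(R11) and
+ // | 88(R11) (j9..j11), covering the columns q1 left out; the ADOXQ and
+ // | ADCXQ carry chains are folded into R14 at the end of every
+ // | iteration and consumed by the next one.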
+ // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u2 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w11 @ SI + MULXQ 72(R11), AX, R12 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R12, DI + + // | j10 + + // | w12 @ DI + MULXQ 80(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j11 + + // | w13 @ R10 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + ADOXQ R14, R9 + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u3 @ 104(SP) + MOVQ 104(SP), DX + + // | + +/* */ + + // | j9 + + // | w12 @ DI + MULXQ 72(R11), AX, R12 + ADOXQ AX, DI + MOVQ DI, 96(SP) + ADCXQ R12, R10 + + // | j10 + + // | w13 @ R10 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j11 + + // | w14 @ R9 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + ADOXQ R14, R8 + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u4 @ 112(SP) + MOVQ 112(SP), DX + + // | + +/* */ + + // | j9 + + // | w13 @ R10 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j10 + + // | w14 @ R9 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j11 + + // | w15 @ R8 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + ADOXQ R14, CX + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u5 @ 120(SP) + MOVQ 120(SP), DX + + // | + +/* */ + + // | j9 + + // | w14 @ R9 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j10 + + // | w15 @ R8 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j11 + + // | w16 @ CX + MULXQ 88(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + ADOXQ R14, R15 + + // | bring the carry from q1 + MOVQ 48(SP), R14 + MOVQ $0x00, AX + ADCXQ AX, R14 + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u6 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w15 @ R8 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j10 + + // | w16 @ CX + MULXQ 80(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j11 + + // | w17 @ R15 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R15 + + // | w18 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), BX + + // | w18 @ BX + ADCXQ R12, BX + ADOXQ R14, BX + MOVQ $0x00, R14 + 
ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u7 @ 64(SP) + MOVQ 64(SP), DX + + // | + +/* */ + + // | j9 + + // | w16 @ CX + MULXQ 72(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j10 + + // | w17 @ R15 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, BX + + // | j11 + + // | w18 @ BX + MULXQ 88(R11), AX, R12 + ADOXQ AX, BX + + // | w19 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), DI + + // | w19 @ DI + ADCXQ R12, DI + ADOXQ R14, DI + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 DI | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u8 @ 56(SP) + MOVQ 56(SP), DX + + // | + +/* */ + + // | j9 + + // | w17 @ R15 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, BX + + // | j10 + + // | w18 @ BX + MULXQ 80(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, DI + + // | j11 + + // | w19 @ DI + MULXQ 88(R11), AX, R12 + ADOXQ AX, DI + + // | w20 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), SI + + // | w20 @ SI + ADCXQ R12, SI + ADOXQ R14, SI + MOVQ $0x00, R14 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 DI | 20 SI | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | save the carry from q2 + // | should be added to w21 + MOVQ R14, 48(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 80(SP), R14 + MOVQ SI, 24(SP) + MOVQ 88(SP), SI + MOVQ DI, 32(SP) + MOVQ 96(SP), DI + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 R14 | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R13 | 10 R14 | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u9 = w9 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R12 + + // | save u9 + MOVQ DX, 40(SP) + + // | + +/* */ + + // | j0 + + // | w9 @ R13 + MULXQ (R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, R14 + + // | j1 + + // | w10 @ R14 + MULXQ 8(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, SI + + // | j2 + + // | w11 @ SI + MULXQ 16(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j3 + + // | w12 @ DI + MULXQ 24(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j4 + + // | w13 @ R10 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j5 + + // | w14 @ R9 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j6 + + // | w15 @ R8 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j7 + + // | w16 @ CX + MULXQ 56(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j8 + + // | w17 @ R15 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, BX + ADOXQ R13, BX + 
ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R14 | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u10 = w10 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R12 + + // | save u10 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w10 @ R14 + MULXQ (R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, SI + + // | j1 + + // | w11 @ SI + MULXQ 8(R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j2 + + // | w12 @ DI + MULXQ 16(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j3 + + // | w13 @ R10 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j4 + + // | w14 @ R9 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j5 + + // | w15 @ R8 + MULXQ 40(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j6 + + // | w16 @ CX + MULXQ 48(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j7 + + // | w17 @ R15 + MULXQ 56(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, BX + + // | j8 + + // | w18 @ BX + MULXQ 64(R11), AX, R12 + ADOXQ AX, BX + + // | w19 @ 32(SP) + // | move to temp register + MOVQ 32(SP), AX + ADCXQ R12, AX + ADOXQ R13, AX + + // | move to an idle register + // | w19 @ AX + MOVQ AX, R13 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 SI + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 R13 | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u11 = w11 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R12 + + // | save u11 + MOVQ DX, 32(SP) + + // | + +/* */ + + // | j0 + + // | w11 @ SI + MULXQ (R11), AX, R12 + ADOXQ AX, SI + ADCXQ R12, DI + + // | j1 + + // | w12 @ DI + MULXQ 8(R11), AX, R12 + ADOXQ AX, DI + ADCXQ R12, R10 + + // | j2 + + // | w13 @ R10 + MULXQ 16(R11), AX, R12 + ADOXQ AX, R10 + ADCXQ R12, R9 + + // | j3 + + // | w14 @ R9 + MULXQ 24(R11), AX, R12 + ADOXQ AX, R9 + ADCXQ R12, R8 + + // | j4 + + // | w15 @ R8 + MULXQ 32(R11), AX, R12 + ADOXQ AX, R8 + ADCXQ R12, CX + + // | j5 + + // | w16 @ CX + MULXQ 40(R11), AX, R12 + ADOXQ AX, CX + ADCXQ R12, R15 + + // | j6 + + // | w17 @ R15 + MULXQ 48(R11), AX, R12 + ADOXQ AX, R15 + ADCXQ R12, BX + + // | j7 + + // | w18 @ BX + MULXQ 56(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, R13 + + // | j8 + + // | w19 @ R13 + MULXQ 64(R11), AX, R12 + ADOXQ AX, R13 + + // | w20 @ 24(SP) + // | move to temp register + MOVQ 24(SP), AX + ADCXQ R12, AX + ADOXQ R14, AX + + // | move to an idle register + // | w20 @ AX + MOVQ AX, R14 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 R13 | 20 R14 | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w21 + ADCQ 48(SP), SI + + // | + +/* montgomerry reduction q4 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 BX | 19 R13 | 20 R14 | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | u0 @ 40(SP) + MOVQ 40(SP), DX + + // | + +/* */ + + // | j9 + + // | w18 @ BX + MULXQ 72(R11), AX, R12 + ADOXQ AX, 
BX + ADCXQ R12, R13 + MOVQ BX, 24(SP) + + // | j10 + + // | w19 @ R13 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, R14 + + // | j11 + + // | w20 @ R14 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R14 + + // | w21 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), BX + ADCXQ R12, BX + + // | bring carry from q2 & q3 + // | w21 @ BX + ADOXQ SI, BX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R12 + ADOXQ R12, SI + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 24(SP) | 19 R13 | 20 R14 | 21 BX | 22 8(SP) | 23 (SP) + + + // | u1 @ 56(SP) + MOVQ 56(SP), DX + + // | + +/* */ + + // | j9 + + // | w19 @ R13 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R13 + ADCXQ R12, R14 + MOVQ R13, 16(SP) + + // | j10 + + // | w20 @ R14 + MULXQ 80(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, BX + + // | j11 + + // | w21 @ BX + MULXQ 88(R11), AX, R12 + ADOXQ AX, BX + + // | w22 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), R13 + ADCXQ R12, R13 + + // | w22 @ R13 + ADOXQ SI, R13 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R12 + ADOXQ R12, SI + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 24(SP) | 19 16(SP) | 20 R14 | 21 BX | 22 R13 | 23 (SP) + + + // | u2 @ 32(SP) + MOVQ 32(SP), DX + + // | + +/* */ + + // | j9 + + // | w20 @ R14 + MULXQ 72(R11), AX, R12 + ADOXQ AX, R14 + ADCXQ R12, BX + + // | j10 + + // | w21 @ BX + MULXQ 80(R11), AX, R12 + ADOXQ AX, BX + ADCXQ R12, R13 + + // | j11 + + // | w22 @ R13 + MULXQ 88(R11), AX, R12 + ADOXQ AX, R13 + + // | w23 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R12, AX + + // | w23 @ AX + ADOXQ SI, AX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R12 + ADOXQ R12, SI + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 DI | 13 R10 | 14 R9 | 15 R8 | 16 CX | 17 R15 | 18 24(SP) | 19 16(SP) | 20 R14 | 21 BX | 22 R13 | 23 AX + + + // | + +/* modular reduction */ + + MOVQ DI, R12 + SUBQ (R11), R12 + MOVQ R10, DX + SBBQ 8(R11), DX + MOVQ DX, (SP) + MOVQ R9, DX + SBBQ 16(R11), DX + MOVQ DX, 8(SP) + MOVQ R8, DX + SBBQ 24(R11), DX + MOVQ DX, 32(SP) + MOVQ CX, DX + SBBQ 32(R11), DX + MOVQ DX, 40(SP) + MOVQ R15, DX + SBBQ 40(R11), DX + MOVQ DX, 48(SP) + MOVQ 24(SP), DX + SBBQ 48(R11), DX + MOVQ DX, 56(SP) + MOVQ 16(SP), DX + SBBQ 56(R11), DX + MOVQ DX, 64(SP) + MOVQ R14, DX + SBBQ 64(R11), DX + MOVQ DX, 72(SP) + MOVQ BX, DX + SBBQ 72(R11), DX + MOVQ DX, 80(SP) + MOVQ R13, DX + SBBQ 80(R11), DX + MOVQ DX, 88(SP) + MOVQ AX, DX + SBBQ 88(R11), DX + MOVQ DX, 96(SP) + SBBQ $0x00, SI + + // | + +/* out */ + + MOVQ c+0(FP), SI + CMOVQCC R12, DI + MOVQ DI, (SI) + CMOVQCC (SP), R10 + MOVQ R10, 8(SI) + CMOVQCC 8(SP), R9 + MOVQ R9, 16(SI) + CMOVQCC 32(SP), R8 + MOVQ R8, 24(SI) + CMOVQCC 40(SP), CX + MOVQ CX, 32(SI) + CMOVQCC 48(SP), R15 + MOVQ R15, 40(SI) + MOVQ 24(SP), DX + CMOVQCC 56(SP), DX + MOVQ DX, 48(SI) + MOVQ 16(SP), DX + CMOVQCC 64(SP), DX + MOVQ DX, 56(SI) + CMOVQCC 72(SP), R14 + MOVQ R14, 64(SI) + CMOVQCC 80(SP), BX + MOVQ BX, 72(SI) + CMOVQCC 88(SP), R13 + MOVQ R13, 80(SI) + CMOVQCC 96(SP), AX + MOVQ AX, 88(SI) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_12(c *[12]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_12(SB), NOSPLIT, $216-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ 
b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, 
R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + 
ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 + + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a9 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 80(SP) + MOVQ $0x00, R9 + + // | a10 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b6 + MOVQ 48(SI), AX + 
MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a10 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + + // | a11 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, BX + + // | a11 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 R10 + // | 12 R11 | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 BX | 20 - | 21 - | 22 - | 23 - + + + MOVQ R10, 88(SP) + MOVQ R11, 96(SP) + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ R9, 144(SP) + MOVQ BX, 152(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 - | 21 - | 22 - | 23 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX + MULQ CX + MOVQ AX, 160(SP) + MOVQ DX, R8 + + // | a0 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 168(SP) + MOVQ $0x00, R8 + + // | a1 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a1 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 176(SP) + MOVQ $0x00, R9 + + // | a2 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a2 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ 
$0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 184(SP) + MOVQ $0x00, R10 + + // | a3 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a3 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 192(SP) + MOVQ $0x00, R11 + + // | a4 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a4 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 200(SP) + MOVQ $0x00, R12 + + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a5 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 208(SP) + MOVQ $0x00, R13 + + // | a6 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a6 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + + // | a7 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a7 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + + // | a8 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a8 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + + // | a9 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a9 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + + // | a10 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ 
AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a10 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + + // | a11 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, BX + + // | a11 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 160(SP) | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) + // | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 - | 21 - | 22 - | 23 - + + + MOVQ 64(SP), AX + ADDQ AX, 160(SP) + MOVQ 72(SP), AX + ADCQ AX, 168(SP) + MOVQ 80(SP), AX + ADCQ AX, 176(SP) + MOVQ 88(SP), AX + ADCQ AX, 184(SP) + MOVQ 96(SP), AX + ADCQ AX, 192(SP) + MOVQ 104(SP), AX + ADCQ AX, 200(SP) + MOVQ 112(SP), AX + ADCQ AX, 208(SP) + ADCQ 120(SP), R14 + ADCQ 128(SP), R15 + ADCQ 136(SP), R8 + ADCQ 144(SP), R9 + ADCQ 152(SP), R10 + ADCQ $0x00, R11 + ADCQ $0x00, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 160(SP) | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R13, 8(SP) + MOVQ 32(SP), R13 + MOVQ R12, 16(SP) + MOVQ 40(SP), R12 + MOVQ R11, 24(SP) + MOVQ 48(SP), R11 + MOVQ R10, 32(SP) + MOVQ 56(SP), R10 + MOVQ R9, 40(SP) + MOVQ 160(SP), R9 + MOVQ R8, 48(SP) + MOVQ R15, 56(SP) + MOVQ R14, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R15 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u0 + MOVQ R14, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R15), AX + MULQ R14 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R13 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R12 + 
MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R11 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w7 @ R10 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + + // | w8 @ R9 + ADCQ DX, R9 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 168(SP) | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u1 + MOVQ R14, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R13 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R12 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R11 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R10 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w8 @ R9 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R8, R9 + + // | move to idle register + MOVQ 168(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 176(SP) | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u2 + MOVQ R14, 88(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w3 @ BX + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w4 @ R13 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w5 @ R12 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w6 @ R11 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w7 @ R10 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w8 @ R9 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w9 @ DI + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to idle register + MOVQ 176(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + 
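+ // | note: without MULX/ADOX/ADCX the carry chain is emulated by hand:
+ // | u_i is the low 64 bits of w_i * inp, MULQ leaves p[j]*u_i in DX:AX,
+ // | AX is added to w_(i+j), and DX plus the running carry in R8 becomes
+ // | the carry for the next limb; each u_i is also parked on the stack
+ // | for the high-limb pass (q2) below.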
+ // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 184(SP) + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u3 + MOVQ R14, 96(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w4 @ R13 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w5 @ R12 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w6 @ R11 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w7 @ R10 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w8 @ R9 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w10 @ SI + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | move to idle register + MOVQ 184(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 192(SP) | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u4 = w4 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u4 + MOVQ R14, 104(SP) + + // | j0 + + // | w4 @ R13 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ DX, R8 + + // | j1 + + // | w5 @ R12 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w6 @ R11 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w7 @ R10 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w8 @ R9 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ DI + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ SI + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w11 @ BX + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to idle register + MOVQ 192(SP), R13 + + // | w12 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 200(SP) | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u5 = w5 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u5 + MOVQ R14, 112(SP) + + // | j0 + + // | w5 @ R12 + MOVQ (R15), 
AX + MULQ R14 + ADDQ AX, R12 + ADCQ DX, R8 + + // | j1 + + // | w6 @ R11 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w7 @ R10 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w8 @ R9 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ SI + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ BX + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w12 @ R13 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | move to idle register + MOVQ 200(SP), R12 + + // | w13 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 208(SP) | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u6 = w6 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u6 + MOVQ R14, 120(SP) + + // | j0 + + // | w6 @ R11 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ DX, R8 + + // | j1 + + // | w7 @ R10 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R9 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ SI + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ BX + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R13 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w13 @ R12 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 + + // | move to idle register + MOVQ 208(SP), R11 + + // | w14 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 64(SP) | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u7 = w7 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u7 + MOVQ R14, 128(SP) + + // | j0 + + // | w7 @ R10 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ DX, R8 + + // | j1 + + // | w8 @ R9 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w9 @ DI + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w10 @ SI + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | 
w11 @ BX + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w12 @ R13 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w13 @ R12 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w14 @ R11 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | move to idle register + MOVQ 64(SP), R10 + + // | w15 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 64(SP) + + // | + +/* montgomerry reduction q2 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w8 @ R9 + MOVQ 64(R15), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ DX, R8 + + // | j9 + + // | w9 @ DI + MOVQ 72(R15), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w10 @ SI + MOVQ 80(R15), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w11 @ BX + MOVQ 88(R15), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + + // | w12 @ R13 + ADCQ DX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w9 @ DI + MOVQ 64(R15), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R8 + MOVQ DI, 72(SP) + + // | j9 + + // | w10 @ SI + MOVQ 72(R15), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w11 @ BX + MOVQ 80(R15), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w12 @ R13 + MOVQ 88(R15), AX + MULQ 80(SP) + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | w13 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w10 @ SI + MOVQ 64(R15), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 80(SP) + + // | j9 + + // | w11 @ BX + MOVQ 72(R15), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w12 @ R13 + MOVQ 80(R15), AX + MULQ 88(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w13 @ R12 + MOVQ 88(R15), AX + MULQ 88(SP) + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 + + // | w14 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX 
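+ // | note: q2 continues where q1 stopped, folding the saved u0..u7 at
+ // | 72(SP)..128(SP) into the four high modulus limbs 64(R15)..88(R15)
+ // | (j8..j11); the carry parked at 64(SP) by q1 re-enters the chain at
+ // | i = 3 just below.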
+ + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w11 @ BX + MOVQ 64(R15), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R8 + + // | j9 + + // | w12 @ R13 + MOVQ 72(R15), AX + MULQ 96(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w13 @ R12 + MOVQ 80(R15), AX + MULQ 96(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w14 @ R11 + MOVQ 88(R15), AX + MULQ 96(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | w15 @ R10 + ADCQ CX, R10 + + // | bring the carry from q1 + MOVQ 64(SP), CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 56(SP) | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w12 @ R13 + MOVQ 64(R15), AX + MULQ 104(SP) + ADDQ AX, R13 + ADCQ DX, R8 + + // | j9 + + // | w13 @ R12 + MOVQ 72(R15), AX + MULQ 104(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w14 @ R11 + MOVQ 80(R15), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w15 @ R10 + MOVQ 88(R15), AX + MULQ 104(SP) + ADDQ AX, R10 + ADCQ DX, CX + ADDQ R8, R10 + + // | move to an idle register + MOVQ 56(SP), R14 + + // | w16 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 48(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w13 @ R12 + MOVQ 64(R15), AX + MULQ 112(SP) + ADDQ AX, R12 + ADCQ DX, R8 + + // | j9 + + // | w14 @ R11 + MOVQ 72(R15), AX + MULQ 112(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w15 @ R10 + MOVQ 80(R15), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w16 @ R14 + MOVQ 88(R15), AX + MULQ 112(SP) + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R8, R14 + + // | move to an idle register + MOVQ 48(SP), DI + + // | w17 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 DI | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w14 @ R11 + MOVQ 64(R15), AX + MULQ 120(SP) + ADDQ AX, R11 + ADCQ DX, R8 + + // | j9 + + // | w15 @ R10 + MOVQ 72(R15), AX + MULQ 120(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w16 @ R14 + MOVQ 80(R15), AX + MULQ 120(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w17 @ DI + MOVQ 88(R15), AX + MULQ 120(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to an idle register + MOVQ 
40(SP), SI + + // | w18 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 DI | 18 SI | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w15 @ R10 + MOVQ 64(R15), AX + MULQ 128(SP) + ADDQ AX, R10 + ADCQ DX, R8 + + // | j9 + + // | w16 @ R14 + MOVQ 72(R15), AX + MULQ 128(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w17 @ DI + MOVQ 80(R15), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w18 @ SI + MOVQ 88(R15), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | tolarete this limb to stay in stack + // | w19 @ 32(SP) + ADCQ CX, 32(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 DI | 18 SI | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | save the carry from q2 + // | should be added to w20 + MOVQ CX, 64(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 72(SP), CX + MOVQ SI, 72(SP) + MOVQ 80(SP), SI + MOVQ DI, 80(SP) + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 CX | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 80(SP) | 18 72(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 CX | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 80(SP) | 18 72(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u8 = w8 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u8 + MOVQ DI, 88(SP) + + // | j0 + + // | w8 @ R9 + MOVQ (R15), AX + MULQ DI + ADDQ AX, R9 + ADCQ DX, R8 + + // | j1 + + // | w9 @ CX + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w10 @ SI + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w11 @ BX + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w12 @ R13 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w13 @ R12 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w14 @ R11 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w15 @ R10 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + + // | w16 @ R14 + ADCQ DX, R14 + ADCQ $0x00, R9 + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 CX | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 80(SP) | 18 72(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u9 = w9 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u9 + MOVQ DI, 96(SP) + 
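+ // | note: q3 computes the remaining u8..u11 from the partially reduced
+ // | words w8..w11 and folds them into the eight low modulus limbs
+ // | (R15)..56(R15) (j0..j7); each u is saved so the high limbs
+ // | 64(R15)..88(R15) can be handled in a later pass, mirroring q2 above.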
+ // | j0 + + // | w9 @ CX + MOVQ (R15), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w10 @ SI + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w11 @ BX + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w12 @ R13 + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w13 @ R12 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w14 @ R11 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w15 @ R10 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w16 @ R14 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ DX, R9 + ADDQ R8, R14 + + // | move to idle register + MOVQ 80(SP), CX + + // | w17 @ CX + ADCQ R9, CX + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 SI | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 CX | 18 72(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u10 = w10 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u10 + MOVQ DI, 80(SP) + + // | j0 + + // | w10 @ SI + MOVQ (R15), AX + MULQ DI + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w11 @ BX + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w12 @ R13 + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w13 @ R12 + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w14 @ R11 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w15 @ R10 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w16 @ R14 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w17 @ CX + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R9 + ADDQ R8, CX + + // | move to idle register + MOVQ 72(SP), SI + + // | w18 @ SI + ADCQ R9, SI + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 BX + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 CX | 18 SI | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | | u11 = w11 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u11 + MOVQ DI, 72(SP) + + // | j0 + + // | w11 @ BX + MOVQ (R15), AX + MULQ DI + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w12 @ R13 + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w13 @ R12 + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w14 @ R11 + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | 
w15 @ R10 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w16 @ R14 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w17 @ CX + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w18 @ SI + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ DX, R9 + ADDQ R8, SI + + // | move to idle register + MOVQ 32(SP), BX + + // | w19 @ BX + ADCQ R9, BX + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 CX | 18 SI | 19 BX | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w20 + ADCQ R9, 64(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 CX | 18 SI | 19 BX | 20 24(SP) | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w16 @ R14 + MOVQ 64(R15), AX + MULQ 88(SP) + ADDQ AX, R14 + ADCQ DX, R8 + + // | j9 + + // | w17 @ CX + MOVQ 72(R15), AX + MULQ 88(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w18 @ SI + MOVQ 80(R15), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w19 @ BX + MOVQ 88(R15), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ 64(SP), DX + ADDQ R8, BX + MOVQ 24(SP), DI + + // | w20 @ DI + ADCQ DX, DI + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 CX | 18 SI | 19 BX | 20 DI | 21 16(SP) | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w17 @ CX + MOVQ 64(R15), AX + MULQ 96(SP) + ADDQ AX, CX + ADCQ DX, R8 + MOVQ CX, 32(SP) + + // | j9 + + // | w18 @ SI + MOVQ 72(R15), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w19 @ BX + MOVQ 80(R15), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w20 @ DI + MOVQ 88(R15), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ DX, R9 + ADDQ R8, DI + MOVQ 16(SP), CX + + // | w21 @ CX + ADCQ R9, CX + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 32(SP) | 18 SI | 19 BX | 20 DI | 21 CX | 22 8(SP) | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w18 @ SI + MOVQ 64(R15), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 16(SP) + + // | j9 + + // | w19 @ BX + MOVQ 72(R15), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w20 @ DI + MOVQ 80(R15), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w21 @ CX + MOVQ 88(R15), AX + MULQ 80(SP) + ADDQ AX, CX + ADCQ DX, R9 + ADDQ R8, CX + MOVQ 8(SP), SI + + // | w22 @ SI + ADCQ R9, SI + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + +/* i = 3 */ + + // | + // | W + // 
| 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 32(SP) | 18 16(SP) | 19 BX | 20 DI | 21 CX | 22 SI | 23 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w19 @ BX + MOVQ 64(R15), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ DX, R8 + + // | j9 + + // | w20 @ DI + MOVQ 72(R15), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w21 @ CX + MOVQ 80(R15), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w22 @ SI + MOVQ 88(R15), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ DX, R9 + ADDQ R8, SI + + // | very last limb goes to short carry register + MOVQ (SP), R8 + + // | w-1 @ R8 + ADCQ R9, R8 + MOVQ $0x00, R9 + ADCQ $0x00, R9 + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - + // | 12 R13 | 13 R12 | 14 R11 | 15 R10 | 16 R14 | 17 32(SP) | 18 16(SP) | 19 BX | 20 DI | 21 CX | 22 SI | 23 R8 + + + // | + +/* modular reduction */ + + MOVQ R13, DX + SUBQ (R15), DX + MOVQ DX, (SP) + MOVQ R12, DX + SBBQ 8(R15), DX + MOVQ DX, 8(SP) + MOVQ R11, DX + SBBQ 16(R15), DX + MOVQ DX, 104(SP) + MOVQ R10, DX + SBBQ 24(R15), DX + MOVQ DX, 112(SP) + MOVQ R14, DX + SBBQ 32(R15), DX + MOVQ DX, 120(SP) + MOVQ 32(SP), DX + SBBQ 40(R15), DX + MOVQ DX, 128(SP) + MOVQ 16(SP), DX + SBBQ 48(R15), DX + MOVQ DX, 136(SP) + MOVQ BX, DX + SBBQ 56(R15), DX + MOVQ DX, 144(SP) + MOVQ DI, DX + SBBQ 64(R15), DX + MOVQ DX, 152(SP) + MOVQ CX, DX + SBBQ 72(R15), DX + MOVQ DX, 160(SP) + MOVQ SI, DX + SBBQ 80(R15), DX + MOVQ DX, 168(SP) + MOVQ R8, DX + SBBQ 88(R15), DX + MOVQ DX, 176(SP) + SBBQ $0x00, R9 + + // | + +/* out */ + + MOVQ c+0(FP), R9 + CMOVQCC (SP), R13 + MOVQ R13, (R9) + CMOVQCC 8(SP), R12 + MOVQ R12, 8(R9) + CMOVQCC 104(SP), R11 + MOVQ R11, 16(R9) + CMOVQCC 112(SP), R10 + MOVQ R10, 24(R9) + CMOVQCC 120(SP), R14 + MOVQ R14, 32(R9) + MOVQ 32(SP), DX + CMOVQCC 128(SP), DX + MOVQ DX, 40(R9) + MOVQ 16(SP), DX + CMOVQCC 136(SP), DX + MOVQ DX, 48(R9) + CMOVQCC 144(SP), BX + MOVQ BX, 56(R9) + CMOVQCC 152(SP), DI + MOVQ DI, 64(R9) + CMOVQCC 160(SP), CX + MOVQ CX, 72(R9) + CMOVQCC 168(SP), SI + MOVQ SI, 80(R9) + CMOVQCC 176(SP), R8 + MOVQ R8, 88(R9) + RET + + // | + +/* end */ + + +// func cpy13(dst *[13]uint64, src *[13]uint64) +TEXT ·cpy13(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + MOVQ 88(SI), R8 + MOVQ R8, 88(DI) + MOVQ 96(SI), R8 + MOVQ R8, 96(DI) + RET + +// func eq13(a *[13]uint64, b *[13]uint64) bool +TEXT ·eq13(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 
80(SI), R8 + JNE ret + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JNE ret + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp13(a *[13]uint64, b *[13]uint64) int8 +TEXT ·cmp13(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JB gt + JA lt + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JB gt + JA lt + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64) +TEXT ·add13(SB), NOSPLIT, $128-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 24(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 32(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 40(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 48(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 56(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 64(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 72(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 80(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 88(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 96(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 104(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 112(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 120(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 24(SP), CX + MOVQ CX, (DI) + CMOVQCC 32(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 40(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 48(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 56(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 64(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 72(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 80(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 88(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 96(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 104(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 112(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 120(SP), BX + MOVQ BX, 96(DI) + RET + + // | + +/* end */ + + RET + +// func addn13(a *[13]uint64, b *[13]uint64) uint64 +TEXT ·addn13(SB), NOSPLIT, $24-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + 
ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double13(c *[13]uint64, a *[13]uint64, p *[13]uint64) +TEXT ·double13(SB), NOSPLIT, $128-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ BX, BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ BX, BX + MOVQ BX, 16(SP) + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 24(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 32(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 40(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 48(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 56(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 64(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 72(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 80(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 88(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 96(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 104(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 112(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 120(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 24(SP), CX + MOVQ CX, (DI) + CMOVQCC 32(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 40(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 48(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 56(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 64(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 72(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 80(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 88(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 96(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 104(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 112(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 120(SP), BX + MOVQ BX, 96(DI) + RET + + // | + +/* end */ + + RET + +// func sub13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64) +TEXT ·sub13(SB), NOSPLIT, $128-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 24(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 32(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 40(SI), AX + 
MOVQ AX, 64(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 80(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 88(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 96(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 104(SP) + CMOVQCS 88(SI), AX + MOVQ AX, 112(SP) + CMOVQCS 96(SI), AX + MOVQ AX, 120(SP) + + // | + MOVQ c+0(FP), DI + ADDQ 24(SP), CX + MOVQ CX, (DI) + ADCQ 32(SP), DX + MOVQ DX, 8(DI) + ADCQ 40(SP), R8 + MOVQ R8, 16(DI) + ADCQ 48(SP), R9 + MOVQ R9, 24(DI) + ADCQ 56(SP), R10 + MOVQ R10, 32(DI) + ADCQ 64(SP), R11 + MOVQ R11, 40(DI) + ADCQ 72(SP), R12 + MOVQ R12, 48(DI) + ADCQ 80(SP), R13 + MOVQ R13, 56(DI) + ADCQ 88(SP), R14 + MOVQ R14, 64(DI) + ADCQ 96(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 104(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + ADCQ 112(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + ADCQ 120(SP), BX + MOVQ BX, 96(DI) + RET + + // | + +/* end */ + + RET + +// func subn13(a *[13]uint64, b *[13]uint64) uint64 +TEXT ·subn13(SB), NOSPLIT, $24-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg13(c *[13]uint64, a *[13]uint64, p *[13]uint64) +TEXT ·_neg13(SB), NOSPLIT, $24-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + MOVQ 88(SI), BX + SBBQ 88(DI), BX + MOVQ BX, 8(SP) + MOVQ 96(SI), BX + SBBQ 96(DI), BX + MOVQ BX, 16(SP) + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_13(a *[13]uint64) +TEXT ·mul_two_13(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RCLQ $0x01, 88(DI) + RCLQ $0x01, 96(DI) + RET + +// func div_two_13(a *[13]uint64) +TEXT ·div_two_13(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 96(DI) + RCRQ $0x01, 88(DI) + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ 
$0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64, inp uint64) +TEXT ·mul13(SB), NOSPLIT, $232-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, 
R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + 
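+	// | as in the previous rounds, limb a9 is multiplied by b0..b8 with
+	// | MULX and folded into the running window through the two
+	// | independent carry chains (ADOXQ / ADCXQ)
+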
// | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX + + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX + + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ AX, AX + + // | a11 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 88(SP) + MOVQ $0x00, R8 + + // | a11 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a11 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a11 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a11 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a11 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ AX, AX + + // | a12 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 96(SP) + MOVQ $0x00, R9 + + // | a12 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a12 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a12 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a12 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a12 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a12 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ BX, R9 + ADCQ $0x00, R9 + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 R10 | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 - | 23 - | 24 - | 25 - + + + MOVQ R10, 104(SP) + MOVQ R11, 112(SP) + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + MOVQ R8, 160(SP) + MOVQ R9, 168(SP) + + // | + // 
| W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 - | 23 - | 24 - | 25 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 176(SP) + + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b11 + MULXQ 88(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b12 + MULXQ 96(SI), AX, R10 + ADCXQ AX, R9 + ADCQ $0x00, R10 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R11, R11 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 184(SP) + + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R12, R12 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 192(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R13, R13 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 200(SP) + + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R14, R14 + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 208(SP) + + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R15, R15 + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 216(SP) + + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ CX, CX + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 224(SP) + + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R8, R8 + + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R9, R9 + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, 
R15 + + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R10, R10 + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ R11, R11 + + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ R12, R12 + + // | a11 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a11 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ DI, DI + + // | a12 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a12 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 - | 23 - | 24 - | 25 - + + + MOVQ 72(SP), AX + ADDQ AX, 176(SP) + MOVQ 80(SP), AX + ADCQ AX, 184(SP) + MOVQ 88(SP), AX + ADCQ AX, 192(SP) + MOVQ 96(SP), AX + ADCQ AX, 200(SP) + MOVQ 104(SP), AX + ADCQ AX, 208(SP) + MOVQ 112(SP), AX + ADCQ AX, 216(SP) + MOVQ 120(SP), AX + ADCQ AX, 224(SP) + ADCQ 128(SP), R13 + ADCQ 136(SP), R14 + ADCQ 144(SP), R15 + ADCQ 152(SP), CX + ADCQ 160(SP), R8 + ADCQ 168(SP), R9 + ADCQ $0x00, R10 + ADCQ $0x00, R11 + ADCQ $0x00, R12 + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R12, 8(SP) + MOVQ 24(SP), R12 + MOVQ R11, 16(SP) + MOVQ 32(SP), R11 + MOVQ R10, 24(SP) + MOVQ 40(SP), R10 + MOVQ R9, 32(SP) + MOVQ 48(SP), R9 + MOVQ R8, 40(SP) + MOVQ 56(SP), R8 + MOVQ CX, 48(SP) + MOVQ 64(SP), CX + MOVQ R15, 56(SP) + MOVQ 176(SP), R15 + MOVQ R14, 64(SP) + MOVQ R13, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), R13 + + // | 
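+	// | the 26-limb double-width product is now reduced with Montgomery
+	// | reduction; since it does not fit into registers at once, the
+	// | reduction runs in stages (q1, q2, ...) over a window of limbs,
+	// | with the remaining limbs spilled to the stack
+
+	// |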
+ // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R12 | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R12 | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R14 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j3 + + // | w3 @ R12 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j4 + + // | w4 @ R11 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j5 + + // | w5 @ R10 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j6 + + // | w6 @ R9 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j7 + + // | w7 @ R8 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j8 + + // | w8 @ CX + MULXQ 64(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + ADOXQ BX, R15 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R12 | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R14 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j2 + + // | w3 @ R12 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j3 + + // | w4 @ R11 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j4 + + // | w5 @ R10 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j5 + + // | w6 @ R9 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j6 + + // | w7 @ R8 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j7 + + // | w8 @ CX + MULXQ 56(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j8 + + // | w9 @ R15 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R15 + + // | w10 @ 184(SP) + // | move to temp register + MOVQ 184(SP), AX + ADCXQ R14, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R12 | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R14 + + // | save u2 + MOVQ DX, 
96(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j1 + + // | w3 @ R12 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j2 + + // | w4 @ R11 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j3 + + // | w5 @ R10 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j4 + + // | w6 @ R9 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j5 + + // | w7 @ R8 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j6 + + // | w8 @ CX + MULXQ 48(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j7 + + // | w9 @ R15 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(R13), AX, R14 + ADOXQ AX, BX + + // | w11 @ 192(SP) + // | move to temp register + MOVQ 192(SP), AX + ADCXQ R14, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R12 | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 SI | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u3 = w3 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R14 + + // | save u3 + MOVQ DX, 104(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ R12 + MULXQ (R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j1 + + // | w4 @ R11 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j2 + + // | w5 @ R10 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j3 + + // | w6 @ R9 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j4 + + // | w7 @ R8 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j5 + + // | w8 @ CX + MULXQ 40(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j6 + + // | w9 @ R15 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(R13), AX, R14 + ADOXQ AX, SI + + // | w12 @ 200(SP) + // | move to temp register + MOVQ 200(SP), AX + ADCXQ R14, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u4 = w4 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R14 + + // | save u4 + MOVQ DX, 112(SP) + + // | + +/* */ + + // | j0 + + // | w4 @ R11 + MULXQ (R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j1 + + // | w5 @ R10 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j2 + + // | w6 @ R9 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j3 + + // | w7 @ R8 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j4 + + // | w8 @ CX + MULXQ 32(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j5 + + // | w9 @ R15 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(R13), AX, R14 
+ ADOXQ AX, SI + ADCXQ R14, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(R13), AX, R14 + ADOXQ AX, DI + + // | w13 @ 208(SP) + // | move to temp register + MOVQ 208(SP), AX + ADCXQ R14, AX + ADOXQ R12, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R12 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R10 | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 216(SP) | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u5 = w5 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R14 + + // | save u5 + MOVQ DX, 120(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R10 + MULXQ (R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j1 + + // | w6 @ R9 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j2 + + // | w7 @ R8 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j3 + + // | w8 @ CX + MULXQ 24(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j4 + + // | w9 @ R15 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j8 + + // | w13 @ R12 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R12 + + // | w14 @ 216(SP) + // | move to temp register + MOVQ 216(SP), AX + ADCXQ R14, AX + ADOXQ R11, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R11 + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R9 | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 224(SP) | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u6 = w6 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R14 + + // | save u6 + MOVQ DX, 128(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ R9 + MULXQ (R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j1 + + // | w7 @ R8 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j2 + + // | w8 @ CX + MULXQ 16(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j3 + + // | w9 @ R15 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j7 + + // | w13 @ R12 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j8 + + // | w14 @ R11 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R11 + + // | w15 @ 224(SP) + // | move to temp register + MOVQ 224(SP), AX + ADCXQ R14, AX + ADOXQ R10, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R10 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R8 | 8 CX | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 72(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u7 = w7 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, 
R14 + + // | save u7 + MOVQ DX, 136(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R8 + MULXQ (R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, CX + + // | j1 + + // | w8 @ CX + MULXQ 8(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j2 + + // | w9 @ R15 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j6 + + // | w13 @ R12 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j7 + + // | w14 @ R11 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j8 + + // | w15 @ R10 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R10 + + // | w16 @ 72(SP) + // | move to temp register + MOVQ 72(SP), AX + ADCXQ R14, AX + ADOXQ R9, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R9 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 CX | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u8 = w8 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R14 + + // | save u8 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ CX + MULXQ (R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R15 + + // | j1 + + // | w9 @ R15 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j5 + + // | w13 @ R12 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j6 + + // | w14 @ R11 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j7 + + // | w15 @ R10 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j8 + + // | w16 @ R9 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R9 + + // | w17 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R14, AX + ADOXQ R8, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R8 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | carry from q1 should be added to w18 + MOVQ CX, 64(SP) + + // | + +/* montgomerry reduction q2 */ + + // | clear flags + XORQ CX, CX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w9 @ R15 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, BX + + // | j10 + + // | w10 @ BX + MULXQ 80(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, SI + + // | j11 + + // | w11 @ SI + MULXQ 88(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j12 + + // | w12 @ DI + MULXQ 96(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + 
ADOXQ CX, R12 + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 BX | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u1 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w10 @ BX + MULXQ 72(R13), AX, R14 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R14, SI + + // | j10 + + // | w11 @ SI + MULXQ 80(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j11 + + // | w12 @ DI + MULXQ 88(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j12 + + // | w13 @ R12 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + ADOXQ CX, R11 + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 SI | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u2 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w11 @ SI + MULXQ 72(R13), AX, R14 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R14, DI + + // | j10 + + // | w12 @ DI + MULXQ 80(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R12 + + // | j11 + + // | w13 @ R12 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j12 + + // | w14 @ R11 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + ADOXQ CX, R10 + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 88(SP) | 12 DI + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u3 @ 104(SP) + MOVQ 104(SP), DX + + // | + +/* */ + + // | j9 + + // | w12 @ DI + MULXQ 72(R13), AX, R14 + ADOXQ AX, DI + MOVQ DI, 96(SP) + ADCXQ R14, R12 + + // | j10 + + // | w13 @ R12 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, R11 + + // | j11 + + // | w14 @ R11 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j12 + + // | w15 @ R10 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + ADOXQ CX, R9 + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 R12 | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u4 @ 112(SP) + MOVQ 112(SP), DX + + // | + +/* */ + + // | j9 + + // | w13 @ R12 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R12 + MOVQ R12, 104(SP) + ADCXQ R14, R11 + + // | j10 + + // | w14 @ R11 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j11 + + // | w15 @ R10 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j12 + + // | w16 @ R9 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + ADOXQ CX, R8 + + // | bring the carry from q1 + MOVQ 64(SP), CX + MOVQ $0x00, AX + ADCXQ AX, CX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 
- | 9 R15 | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u5 @ 120(SP) + MOVQ 120(SP), DX + + // | + +/* */ + + // | j9 + + // | w14 @ R11 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j10 + + // | w15 @ R10 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j11 + + // | w16 @ R9 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j12 + + // | w17 @ R8 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R8 + + // | w18 @ 56(SP) + // | move to an idle register + MOVQ 56(SP), BX + + // | w18 @ BX + ADCXQ R14, BX + ADOXQ CX, BX + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u6 @ 128(SP) + MOVQ 128(SP), DX + + // | + +/* */ + + // | j9 + + // | w15 @ R10 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j10 + + // | w16 @ R9 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j11 + + // | w17 @ R8 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j12 + + // | w18 @ BX + MULXQ 96(R13), AX, R14 + ADOXQ AX, BX + + // | w19 @ 48(SP) + // | move to an idle register + MOVQ 48(SP), DI + + // | w19 @ DI + ADCXQ R14, DI + ADOXQ CX, DI + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 DI | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u7 @ 136(SP) + MOVQ 136(SP), DX + + // | + +/* */ + + // | j9 + + // | w16 @ R9 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j10 + + // | w17 @ R8 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j11 + + // | w18 @ BX + MULXQ 88(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, DI + + // | j12 + + // | w19 @ DI + MULXQ 96(R13), AX, R14 + ADOXQ AX, DI + + // | w20 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), SI + + // | w20 @ SI + ADCXQ R14, SI + ADOXQ CX, SI + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 DI | 20 SI | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u8 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w17 @ R8 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j10 + + // | w18 @ BX + MULXQ 80(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, DI + + // | j11 + + // | w19 @ DI + MULXQ 88(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, SI + + // | j12 + + // | w20 @ SI + MULXQ 96(R13), AX, R14 + ADOXQ AX, SI + + // | w21 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), R12 + + // | w21 @ R12 + ADCXQ R14, R12 + ADOXQ CX, R12 + MOVQ $0x00, CX + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 
80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 DI | 20 SI | 21 R12 | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | save the carry from q2 + // | should be added to w22 + MOVQ CX, 64(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 80(SP), CX + MOVQ R12, 32(SP) + MOVQ 88(SP), R12 + MOVQ SI, 40(SP) + MOVQ 96(SP), SI + MOVQ DI, 48(SP) + MOVQ 104(SP), DI + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 CX | 11 R12 | 12 SI + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R15 | 10 CX | 11 R12 | 12 SI + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u9 = w9 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R14 + + // | save u9 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w9 @ R15 + MULXQ (R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, CX + + // | j1 + + // | w10 @ CX + MULXQ 8(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R12 + + // | j2 + + // | w11 @ R12 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, SI + + // | j3 + + // | w12 @ SI + MULXQ 24(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j4 + + // | w13 @ DI + MULXQ 32(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R11 + + // | j5 + + // | w14 @ R11 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j6 + + // | w15 @ R10 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j7 + + // | w16 @ R9 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j8 + + // | w17 @ R8 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + ADOXQ R15, BX + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 CX | 11 R12 | 12 SI + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u10 = w10 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R14 + + // | save u10 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w10 @ CX + MULXQ (R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R12 + + // | j1 + + // | w11 @ R12 + MULXQ 8(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, SI + + // | j2 + + // | w12 @ SI + MULXQ 16(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j3 + + // | w13 @ DI + MULXQ 24(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R11 + + // | j4 + + // | w14 @ R11 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j5 + + // | w15 @ R10 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j6 + + // | w16 @ R9 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j7 + + // | w17 @ R8 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j8 + + // | w18 @ BX + MULXQ 64(R13), AX, R14 + ADOXQ AX, BX + + // | w19 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R14, AX + ADOXQ R15, AX + + // | move to an idle register + // | w19 @ AX + MOVQ AX, R15 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 
11 R12 | 12 SI + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 R15 | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u11 = w11 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R14 + + // | save u11 + MOVQ DX, 48(SP) + + // | + +/* */ + + // | j0 + + // | w11 @ R12 + MULXQ (R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, SI + + // | j1 + + // | w12 @ SI + MULXQ 8(R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j2 + + // | w13 @ DI + MULXQ 16(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R11 + + // | j3 + + // | w14 @ R11 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j4 + + // | w15 @ R10 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j5 + + // | w16 @ R9 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j6 + + // | w17 @ R8 + MULXQ 48(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j7 + + // | w18 @ BX + MULXQ 56(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, R15 + + // | j8 + + // | w19 @ R15 + MULXQ 64(R13), AX, R14 + ADOXQ AX, R15 + + // | w20 @ 40(SP) + // | move to temp register + MOVQ 40(SP), AX + ADCXQ R14, AX + ADOXQ CX, AX + + // | move to an idle register + // | w20 @ AX + MOVQ AX, CX + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 12 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 SI + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 R15 | 20 CX | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u12 = w12 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R14 + + // | save u12 + MOVQ DX, 40(SP) + + // | + +/* */ + + // | j0 + + // | w12 @ SI + MULXQ (R13), AX, R14 + ADOXQ AX, SI + ADCXQ R14, DI + + // | j1 + + // | w13 @ DI + MULXQ 8(R13), AX, R14 + ADOXQ AX, DI + ADCXQ R14, R11 + + // | j2 + + // | w14 @ R11 + MULXQ 16(R13), AX, R14 + ADOXQ AX, R11 + ADCXQ R14, R10 + + // | j3 + + // | w15 @ R10 + MULXQ 24(R13), AX, R14 + ADOXQ AX, R10 + ADCXQ R14, R9 + + // | j4 + + // | w16 @ R9 + MULXQ 32(R13), AX, R14 + ADOXQ AX, R9 + ADCXQ R14, R8 + + // | j5 + + // | w17 @ R8 + MULXQ 40(R13), AX, R14 + ADOXQ AX, R8 + ADCXQ R14, BX + + // | j6 + + // | w18 @ BX + MULXQ 48(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, R15 + + // | j7 + + // | w19 @ R15 + MULXQ 56(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, CX + + // | j8 + + // | w20 @ CX + MULXQ 64(R13), AX, R14 + ADOXQ AX, CX + + // | w21 @ 32(SP) + // | move to temp register + MOVQ 32(SP), AX + ADCXQ R14, AX + ADOXQ R12, AX + + // | move to an idle register + // | w21 @ AX + MOVQ AX, R12 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 R15 | 20 CX | 21 R12 | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w22 + ADCQ 64(SP), SI + + // | + +/* montgomerry reduction q4 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 BX | 19 R15 | 20 CX | 21 R12 | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u0 @ 56(SP) + MOVQ 56(SP), DX + + // | + +/* */ + + // | j9 + + // | w18 @ BX + MULXQ 72(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, R15 + MOVQ BX, 32(SP) + + // | j10 + + // | w19 @ R15 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, 
CX + + // | j11 + + // | w20 @ CX + MULXQ 88(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R12 + + // | j12 + + // | w21 @ R12 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R12 + + // | w22 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), BX + ADCXQ R14, BX + + // | bring carry from q2 & q3 + // | w22 @ BX + ADOXQ SI, BX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R14 + ADOXQ R14, SI + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 32(SP) | 19 R15 | 20 CX | 21 R12 | 22 BX | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | u1 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w19 @ R15 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, CX + MOVQ R15, 24(SP) + + // | j10 + + // | w20 @ CX + MULXQ 80(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R12 + + // | j11 + + // | w21 @ R12 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, BX + + // | j12 + + // | w22 @ BX + MULXQ 96(R13), AX, R14 + ADOXQ AX, BX + + // | w23 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), R15 + ADCXQ R14, R15 + + // | w23 @ R15 + ADOXQ SI, R15 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R14 + ADOXQ R14, SI + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 32(SP) | 19 24(SP) | 20 CX | 21 R12 | 22 BX | 23 R15 | 24 8(SP) | 25 (SP) + + + // | u2 @ 48(SP) + MOVQ 48(SP), DX + + // | + +/* */ + + // | j9 + + // | w20 @ CX + MULXQ 72(R13), AX, R14 + ADOXQ AX, CX + ADCXQ R14, R12 + MOVQ CX, 16(SP) + + // | j10 + + // | w21 @ R12 + MULXQ 80(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, BX + + // | j11 + + // | w22 @ BX + MULXQ 88(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, R15 + + // | j12 + + // | w23 @ R15 + MULXQ 96(R13), AX, R14 + ADOXQ AX, R15 + + // | w24 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), CX + ADCXQ R14, CX + + // | w24 @ CX + ADOXQ SI, CX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R14 + ADOXQ R14, SI + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 32(SP) | 19 24(SP) | 20 16(SP) | 21 R12 | 22 BX | 23 R15 | 24 CX | 25 (SP) + + + // | u3 @ 40(SP) + MOVQ 40(SP), DX + + // | + +/* */ + + // | j9 + + // | w21 @ R12 + MULXQ 72(R13), AX, R14 + ADOXQ AX, R12 + ADCXQ R14, BX + + // | j10 + + // | w22 @ BX + MULXQ 80(R13), AX, R14 + ADOXQ AX, BX + ADCXQ R14, R15 + + // | j11 + + // | w23 @ R15 + MULXQ 88(R13), AX, R14 + ADOXQ AX, R15 + ADCXQ R14, CX + + // | j12 + + // | w24 @ CX + MULXQ 96(R13), AX, R14 + ADOXQ AX, CX + + // | w25 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R14, AX + + // | w25 @ AX + ADOXQ SI, AX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R14 + ADOXQ R14, SI + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 DI | 14 R11 | 15 R10 | 16 R9 | 17 R8 | 18 32(SP) | 19 24(SP) | 20 16(SP) | 21 R12 | 22 BX | 23 R15 | 24 CX | 25 AX + + + // | + +/* modular reduction */ + + MOVQ DI, R14 + SUBQ (R13), R14 + MOVQ R11, DX + SBBQ 8(R13), DX + MOVQ DX, (SP) + MOVQ R10, DX + SBBQ 16(R13), DX + MOVQ DX, 8(SP) + MOVQ R9, DX + SBBQ 24(R13), DX + MOVQ DX, 40(SP) + MOVQ R8, DX + SBBQ 32(R13), DX + MOVQ DX, 48(SP) + MOVQ 32(SP), DX + SBBQ 40(R13), DX + MOVQ DX, 56(SP) + MOVQ 24(SP), DX + SBBQ 48(R13), DX + MOVQ DX, 64(SP) + MOVQ 
16(SP), DX + SBBQ 56(R13), DX + MOVQ DX, 72(SP) + MOVQ R12, DX + SBBQ 64(R13), DX + MOVQ DX, 80(SP) + MOVQ BX, DX + SBBQ 72(R13), DX + MOVQ DX, 88(SP) + MOVQ R15, DX + SBBQ 80(R13), DX + MOVQ DX, 96(SP) + MOVQ CX, DX + SBBQ 88(R13), DX + MOVQ DX, 104(SP) + MOVQ AX, DX + SBBQ 96(R13), DX + MOVQ DX, 112(SP) + SBBQ $0x00, SI + + // | + +/* out */ + + MOVQ c+0(FP), SI + CMOVQCC R14, DI + MOVQ DI, (SI) + CMOVQCC (SP), R11 + MOVQ R11, 8(SI) + CMOVQCC 8(SP), R10 + MOVQ R10, 16(SI) + CMOVQCC 40(SP), R9 + MOVQ R9, 24(SI) + CMOVQCC 48(SP), R8 + MOVQ R8, 32(SI) + MOVQ 32(SP), DX + CMOVQCC 56(SP), DX + MOVQ DX, 40(SI) + MOVQ 24(SP), DX + CMOVQCC 64(SP), DX + MOVQ DX, 48(SI) + MOVQ 16(SP), DX + CMOVQCC 72(SP), DX + MOVQ DX, 56(SI) + CMOVQCC 80(SP), R12 + MOVQ R12, 64(SI) + CMOVQCC 88(SP), BX + MOVQ BX, 72(SI) + CMOVQCC 96(SP), R15 + MOVQ R15, 80(SI) + CMOVQCC 104(SP), CX + MOVQ CX, 88(SI) + CMOVQCC 112(SP), AX + MOVQ AX, 96(SI) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_13(c *[13]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_13(SB), NOSPLIT, $240-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + 
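The SUBQ/SBBQ chain followed by the CMOVQCC selection at the end of mul13 above is the usual branch-free final reduction: subtract the modulus once and keep the difference only when no borrow occurred. A minimal Go sketch of that step, assuming 13 64-bit limbs; reduceOnce is a hypothetical name, not part of this package:

package eip

import "math/bits"

// reduceOnce sketches the tail of mul13: t = w - p with a borrow chain, then
// keep t only if the subtraction did not borrow (i.e. w >= p). The assembly
// selects with CMOVQCC instead of a branch and also folds in the carry word
// left over from the reduction.
func reduceOnce(w, p *[13]uint64) {
	var t [13]uint64
	var borrow uint64
	for i := 0; i < 13; i++ {
		t[i], borrow = bits.Sub64(w[i], p[i], borrow)
	}
	if borrow == 0 {
		*w = t
	}
}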
MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + 
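Most of the a_i * b_j steps in mul_no_adx_bmi2_13 above follow the same MULQ/ADDQ/ADCQ pattern: the 128-bit product lands in DX:AX, is added into a three-word window of the result, and the overflow is parked in BX for the next step. A rough Go equivalent of one such step; muladd is a made-up helper (not something the generator emits), reusing the math/bits import from the sketch above:

// muladd accumulates a*b into the window (w0, w1, w2) together with the spill
// left over from the previous step, and returns the new spill (the role BX
// plays in the assembly above).
func muladd(a, b, w0, w1, w2, spillIn uint64) (r0, r1, r2, spillOut uint64) {
	hi, lo := bits.Mul64(a, b)
	var c uint64
	r0, c = bits.Add64(w0, lo, 0)
	r1, c = bits.Add64(w1, hi, c)
	r2, spillOut = bits.Add64(w2, spillIn, c)
	return
}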
ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 + + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * 
b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a9 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 80(SP) + MOVQ $0x00, R9 + + // | a10 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a10 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 88(SP) + MOVQ $0x00, R10 + + // | a11 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a11 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + + // | a12 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, BX + + // | a12 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R11 + // | 13 R12 | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 BX | 21 - | 22 - | 
23 - | 24 - | 25 - + + + MOVQ R11, 96(SP) + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ R9, 144(SP) + MOVQ R10, 152(SP) + MOVQ BX, 160(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 - | 22 - | 23 - | 24 - | 25 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX + MULQ CX + MOVQ AX, 168(SP) + MOVQ DX, R8 + + // | a0 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 176(SP) + MOVQ $0x00, R8 + + // | a1 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a1 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 184(SP) + MOVQ $0x00, R9 + + // | a2 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a2 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 192(SP) + MOVQ $0x00, R10 + + // | a3 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a3 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 200(SP) + MOVQ $0x00, R11 + + // | a4 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 
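The "W part 1 multiplication" / "W part 1 moved to stack" tables above show why the 13x13-limb product is computed in two passes: the 26 result words do not fit in the general-purpose registers, so the code first multiplies a by b0..b7, parks those partial words on the stack, then multiplies a by b8..b12 and later adds the two partial results. A plain-Go reference for the combined product; mul13Wide is an illustrative name only, same package and import as the sketches above:

// mul13Wide is the schoolbook product the two passes compute together:
// w = a * b over 13 limbs, producing 26 result words.
func mul13Wide(a, b *[13]uint64) [26]uint64 {
	var w [26]uint64
	for i := 0; i < 13; i++ {
		var carry uint64
		for j := 0; j < 13; j++ {
			hi, lo := bits.Mul64(a[i], b[j])
			var c uint64
			lo, c = bits.Add64(lo, carry, 0)
			hi += c
			lo, c = bits.Add64(lo, w[i+j], 0)
			hi += c
			w[i+j] = lo
			carry = hi
		}
		w[i+13] = carry
	}
	return w
}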
+ ADCQ BX, R8 + + // | a4 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 208(SP) + MOVQ $0x00, R12 + + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a5 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 216(SP) + MOVQ $0x00, R13 + + // | a6 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a6 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 224(SP) + MOVQ $0x00, R14 + + // | a7 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a7 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 232(SP) + MOVQ $0x00, R15 + + // | a8 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a8 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + + // | a9 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a9 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + + // | a10 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * 
b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a10 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + + // | a11 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a11 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + + // | a12 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, BX + + // | a12 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 168(SP) | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) + // | 13 104(SP) | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 - | 22 - | 23 - | 24 - | 25 - + + + MOVQ 64(SP), AX + ADDQ AX, 168(SP) + MOVQ 72(SP), AX + ADCQ AX, 176(SP) + MOVQ 80(SP), AX + ADCQ AX, 184(SP) + MOVQ 88(SP), AX + ADCQ AX, 192(SP) + MOVQ 96(SP), AX + ADCQ AX, 200(SP) + MOVQ 104(SP), AX + ADCQ AX, 208(SP) + MOVQ 112(SP), AX + ADCQ AX, 216(SP) + MOVQ 120(SP), AX + ADCQ AX, 224(SP) + MOVQ 128(SP), AX + ADCQ AX, 232(SP) + ADCQ 136(SP), R8 + ADCQ 144(SP), R9 + ADCQ 152(SP), R10 + ADCQ 160(SP), R11 + ADCQ $0x00, R12 + ADCQ $0x00, R13 + ADCQ $0x00, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 168(SP) | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R15, 8(SP) + MOVQ 32(SP), R15 + MOVQ R14, 16(SP) + MOVQ 40(SP), R14 + MOVQ R13, 24(SP) + MOVQ 48(SP), R13 + MOVQ R12, 32(SP) + MOVQ 56(SP), R12 + MOVQ R11, 40(SP) + MOVQ 168(SP), R11 + MOVQ R10, 48(SP) + MOVQ R9, 56(SP) + MOVQ R8, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R10 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) 
| 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u0 + MOVQ R9, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R15 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R14 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R13 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w7 @ R12 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + + // | w8 @ R11 + ADCQ DX, R11 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 176(SP) | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u1 + MOVQ R9, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R15 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R14 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R13 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R12 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w8 @ R11 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 + + // | move to idle register + MOVQ 176(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 DI | 10 184(SP) | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u2 + MOVQ R9, 88(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w3 @ BX + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ 
$0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w4 @ R15 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w5 @ R14 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w6 @ R13 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w7 @ R12 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w8 @ R11 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w9 @ DI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to idle register + MOVQ 184(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 DI | 10 SI | 11 192(SP) | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u3 + MOVQ R9, 96(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, R8 + + // | j1 + + // | w4 @ R15 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w5 @ R14 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w6 @ R13 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w7 @ R12 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w8 @ R11 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w10 @ SI + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | move to idle register + MOVQ 192(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 DI | 10 SI | 11 BX | 12 200(SP) + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u4 = w4 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u4 + MOVQ R9, 104(SP) + + // | j0 + + // | w4 @ R15 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, R8 + + // | j1 + + // | w5 @ R14 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w6 @ R13 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w7 @ R12 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w8 @ R11 + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, 
R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ DI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ SI + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w11 @ BX + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to idle register + MOVQ 200(SP), R15 + + // | w12 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 208(SP) | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u5 = w5 * inp + MOVQ R14, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u5 + MOVQ R9, 112(SP) + + // | j0 + + // | w5 @ R14 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ DX, R8 + + // | j1 + + // | w6 @ R13 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w7 @ R12 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w8 @ R11 + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ SI + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ BX + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w12 @ R15 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R8, R15 + + // | move to idle register + MOVQ 208(SP), R14 + + // | w13 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R13 | 7 R12 | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 216(SP) | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u6 = w6 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u6 + MOVQ R9, 120(SP) + + // | j0 + + // | w6 @ R13 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R8 + + // | j1 + + // | w7 @ R12 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R11 + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ SI + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ BX + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R15 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w13 @ R14 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ 
DX, CX + ADDQ R8, R14 + + // | move to idle register + MOVQ 216(SP), R13 + + // | w14 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R12 | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 224(SP) | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u7 = w7 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, R9 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u7 + MOVQ R9, 128(SP) + + // | j0 + + // | w7 @ R12 + MOVQ (R10), AX + MULQ R9 + ADDQ AX, R12 + ADCQ DX, R8 + + // | j1 + + // | w8 @ R11 + MOVQ 8(R10), AX + MULQ R9 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w9 @ DI + MOVQ 16(R10), AX + MULQ R9 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w10 @ SI + MOVQ 24(R10), AX + MULQ R9 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w11 @ BX + MOVQ 32(R10), AX + MULQ R9 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w12 @ R15 + MOVQ 40(R10), AX + MULQ R9 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w13 @ R14 + MOVQ 48(R10), AX + MULQ R9 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w14 @ R13 + MOVQ 56(R10), AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | move to idle register + MOVQ 224(SP), R12 + + // | w15 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 136(SP) + + // | + +/* montgomerry reduction q2 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w8 @ R11 + MOVQ 64(R10), AX + MULQ 72(SP) + ADDQ AX, R11 + ADCQ DX, R8 + + // | j9 + + // | w9 @ DI + MOVQ 72(R10), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w10 @ SI + MOVQ 80(R10), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w11 @ BX + MOVQ 88(R10), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w12 @ R15 + MOVQ 96(R10), AX + MULQ 72(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + + // | w13 @ R14 + ADCQ DX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 DI | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w9 @ DI + MOVQ 64(R10), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R8 + MOVQ DI, 72(SP) + + // | j9 + + 
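The q1..q4 passes above make up one interleaved Montgomery reduction: for each of the 13 low result words a factor u = w_i * inp is chosen (only the low 64 bits, exactly what the MULQ against inp+32(FP) keeps, with inp the usual constant -p^{-1} mod 2^64) so that adding u*p clears that word; the inner j-loop over the 13 modulus limbs is simply split into two ranges, which is what the q1/q3 and q2/q4 passes are, so the live words stay in registers. A compact Go reference for the whole reduction; montReduce13 is an assumed name, not generated code:

// montReduce13 clears the low 13 words of the double-width value w by adding
// suitable multiples of p, leaving the Montgomery-reduced result in the high
// words.
func montReduce13(w *[26]uint64, p *[13]uint64, inp uint64) {
	for i := 0; i < 13; i++ {
		u := w[i] * inp // low 64 bits only, as in the assembly
		var carry uint64
		for j := 0; j < 13; j++ {
			hi, lo := bits.Mul64(u, p[j])
			var c uint64
			lo, c = bits.Add64(lo, carry, 0)
			hi += c
			lo, c = bits.Add64(lo, w[i+j], 0)
			hi += c
			w[i+j] = lo
			carry = hi
		}
		// fold the remaining carry upward; a full implementation also has to
		// track the overflow past the top word (the assembly keeps it in CX/SI)
		var c uint64
		w[i+13], c = bits.Add64(w[i+13], carry, 0)
		for k := i + 14; c != 0 && k < 26; k++ {
			w[k], c = bits.Add64(w[k], 0, c)
		}
	}
	// w[13..25] now holds the reduced value (still possibly >= p);
	// the conditional subtraction sketched earlier finishes the job.
}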
// | w10 @ SI + MOVQ 72(R10), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w11 @ BX + MOVQ 80(R10), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w12 @ R15 + MOVQ 88(R10), AX + MULQ 80(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w13 @ R14 + MOVQ 96(R10), AX + MULQ 80(SP) + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R8, R14 + + // | w14 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 SI | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w10 @ SI + MOVQ 64(R10), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 80(SP) + + // | j9 + + // | w11 @ BX + MOVQ 72(R10), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w12 @ R15 + MOVQ 80(R10), AX + MULQ 88(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w13 @ R14 + MOVQ 88(R10), AX + MULQ 88(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w14 @ R13 + MOVQ 96(R10), AX + MULQ 88(SP) + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 + + // | w15 @ R12 + ADCQ CX, R12 + + // | bring the carry from q1 + MOVQ 136(SP), CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 80(SP) | 11 BX | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 232(SP) | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w11 @ BX + MOVQ 64(R10), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R8 + MOVQ BX, 88(SP) + + // | j9 + + // | w12 @ R15 + MOVQ 72(R10), AX + MULQ 96(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w13 @ R14 + MOVQ 80(R10), AX + MULQ 96(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w14 @ R13 + MOVQ 88(R10), AX + MULQ 96(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w15 @ R12 + MOVQ 96(R10), AX + MULQ 96(SP) + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 + + // | move to an idle register + MOVQ 232(SP), R9 + + // | w16 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 64(SP) | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w12 @ R15 + MOVQ 64(R10), AX + MULQ 104(SP) + ADDQ AX, R15 + ADCQ DX, R8 + + // | j9 + + // | w13 @ R14 + MOVQ 72(R10), AX + MULQ 104(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w14 @ R13 + MOVQ 80(R10), AX + MULQ 104(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w15 @ R12 + MOVQ 88(R10), AX + MULQ 104(SP) + ADDQ AX, R12 + ADCQ 
$0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w16 @ R9 + MOVQ 96(R10), AX + MULQ 104(SP) + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R8, R9 + + // | move to an idle register + MOVQ 64(SP), BX + + // | w17 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 BX | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w13 @ R14 + MOVQ 64(R10), AX + MULQ 112(SP) + ADDQ AX, R14 + ADCQ DX, R8 + + // | j9 + + // | w14 @ R13 + MOVQ 72(R10), AX + MULQ 112(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w15 @ R12 + MOVQ 80(R10), AX + MULQ 112(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w16 @ R9 + MOVQ 88(R10), AX + MULQ 112(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w17 @ BX + MOVQ 96(R10), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX + + // | move to an idle register + MOVQ 56(SP), DI + + // | w18 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 BX | 18 DI | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w14 @ R13 + MOVQ 64(R10), AX + MULQ 120(SP) + ADDQ AX, R13 + ADCQ DX, R8 + + // | j9 + + // | w15 @ R12 + MOVQ 72(R10), AX + MULQ 120(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w16 @ R9 + MOVQ 80(R10), AX + MULQ 120(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w17 @ BX + MOVQ 88(R10), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w18 @ DI + MOVQ 96(R10), AX + MULQ 120(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI + + // | move to an idle register + MOVQ 48(SP), SI + + // | w19 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 BX | 18 DI | 19 SI | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w15 @ R12 + MOVQ 64(R10), AX + MULQ 128(SP) + ADDQ AX, R12 + ADCQ DX, R8 + + // | j9 + + // | w16 @ R9 + MOVQ 72(R10), AX + MULQ 128(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w17 @ BX + MOVQ 80(R10), AX + MULQ 128(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w18 @ DI + MOVQ 88(R10), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w19 @ SI + MOVQ 96(R10), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI + + // | tolarete this limb to stay in stack + // | w20 @ 40(SP) + ADCQ CX, 40(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 
8 R11 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 BX | 18 DI | 19 SI | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | save the carry from q2 + // | should be added to w21 + MOVQ CX, 136(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 72(SP), CX + MOVQ SI, 72(SP) + MOVQ 80(SP), SI + MOVQ DI, 80(SP) + MOVQ 88(SP), DI + MOVQ BX, 88(SP) + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 CX | 10 SI | 11 DI | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 88(SP) | 18 80(SP) | 19 72(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R11 | 9 CX | 10 SI | 11 DI | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 88(SP) | 18 80(SP) | 19 72(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u8 = w8 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u8 + MOVQ BX, 96(SP) + + // | j0 + + // | w8 @ R11 + MOVQ (R10), AX + MULQ BX + ADDQ AX, R11 + ADCQ DX, R8 + + // | j1 + + // | w9 @ CX + MOVQ 8(R10), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w10 @ SI + MOVQ 16(R10), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w11 @ DI + MOVQ 24(R10), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w12 @ R15 + MOVQ 32(R10), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w13 @ R14 + MOVQ 40(R10), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w14 @ R13 + MOVQ 48(R10), AX + MULQ BX + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w15 @ R12 + MOVQ 56(R10), AX + MULQ BX + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + + // | w16 @ R9 + ADCQ DX, R9 + ADCQ $0x00, R11 + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 CX | 10 SI | 11 DI | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 88(SP) | 18 80(SP) | 19 72(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u9 = w9 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u9 + MOVQ BX, 104(SP) + + // | j0 + + // | w9 @ CX + MOVQ (R10), AX + MULQ BX + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w10 @ SI + MOVQ 8(R10), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w11 @ DI + MOVQ 16(R10), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w12 @ R15 + MOVQ 24(R10), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w13 @ R14 + MOVQ 32(R10), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w14 @ R13 + MOVQ 40(R10), AX + MULQ BX + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w15 @ R12 + MOVQ 48(R10), AX + MULQ BX + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | 
j7 + + // | w16 @ R9 + MOVQ 56(R10), AX + MULQ BX + ADDQ AX, R9 + ADCQ DX, R11 + ADDQ R8, R9 + + // | move to idle register + MOVQ 88(SP), CX + + // | w17 @ CX + ADCQ R11, CX + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 SI | 11 DI | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 80(SP) | 19 72(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u10 = w10 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u10 + MOVQ BX, 88(SP) + + // | j0 + + // | w10 @ SI + MOVQ (R10), AX + MULQ BX + ADDQ AX, SI + ADCQ DX, R8 + + // | j1 + + // | w11 @ DI + MOVQ 8(R10), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w12 @ R15 + MOVQ 16(R10), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w13 @ R14 + MOVQ 24(R10), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w14 @ R13 + MOVQ 32(R10), AX + MULQ BX + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w15 @ R12 + MOVQ 40(R10), AX + MULQ BX + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w16 @ R9 + MOVQ 48(R10), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w17 @ CX + MOVQ 56(R10), AX + MULQ BX + ADDQ AX, CX + ADCQ DX, R11 + ADDQ R8, CX + + // | move to idle register + MOVQ 80(SP), SI + + // | w18 @ SI + ADCQ R11, SI + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 DI | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 SI | 19 72(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | | u11 = w11 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u11 + MOVQ BX, 80(SP) + + // | j0 + + // | w11 @ DI + MOVQ (R10), AX + MULQ BX + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w12 @ R15 + MOVQ 8(R10), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w13 @ R14 + MOVQ 16(R10), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w14 @ R13 + MOVQ 24(R10), AX + MULQ BX + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w15 @ R12 + MOVQ 32(R10), AX + MULQ BX + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w16 @ R9 + MOVQ 40(R10), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w17 @ CX + MOVQ 48(R10), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w18 @ SI + MOVQ 56(R10), AX + MULQ BX + ADDQ AX, SI + ADCQ DX, R11 + ADDQ R8, SI + + // | move to idle register + MOVQ 72(SP), DI + + // | w19 @ DI + ADCQ R11, DI + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 12 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 R15 + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 SI | 19 DI | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 
25 (SP) + + + // | | u12 = w12 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u12 + MOVQ BX, 72(SP) + + // | j0 + + // | w12 @ R15 + MOVQ (R10), AX + MULQ BX + ADDQ AX, R15 + ADCQ DX, R8 + + // | j1 + + // | w13 @ R14 + MOVQ 8(R10), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w14 @ R13 + MOVQ 16(R10), AX + MULQ BX + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w15 @ R12 + MOVQ 24(R10), AX + MULQ BX + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w16 @ R9 + MOVQ 32(R10), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w17 @ CX + MOVQ 40(R10), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w18 @ SI + MOVQ 48(R10), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w19 @ DI + MOVQ 56(R10), AX + MULQ BX + ADDQ AX, DI + ADCQ DX, R11 + ADDQ R8, DI + + // | move to idle register + MOVQ 40(SP), R15 + + // | w20 @ R15 + ADCQ R11, R15 + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 SI | 19 DI | 20 R15 | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w21 + ADCQ R11, 136(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 SI | 19 DI | 20 R15 | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w16 @ R9 + MOVQ 64(R10), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ DX, R8 + + // | j9 + + // | w17 @ CX + MOVQ 72(R10), AX + MULQ 96(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w18 @ SI + MOVQ 80(R10), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w19 @ DI + MOVQ 88(R10), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w20 @ R15 + MOVQ 96(R10), AX + MULQ 96(SP) + ADDQ AX, R15 + ADCQ 136(SP), DX + ADDQ R8, R15 + MOVQ 32(SP), BX + + // | w21 @ BX + ADCQ DX, BX + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 CX | 18 SI | 19 DI | 20 R15 | 21 BX | 22 24(SP) | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w17 @ CX + MOVQ 64(R10), AX + MULQ 104(SP) + ADDQ AX, CX + ADCQ DX, R8 + MOVQ CX, 40(SP) + + // | j9 + + // | w18 @ SI + MOVQ 72(R10), AX + MULQ 104(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w19 @ DI + MOVQ 80(R10), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w20 @ R15 + MOVQ 88(R10), AX + MULQ 104(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w21 @ BX + MOVQ 96(R10), AX + MULQ 104(SP) 
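+ // | note: p[12]*u9 sits in DX:AX; the low word and the short carry R8 fold into w21 (BX), while the high word joins the long carry R11 and is carried on into w22 below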
+ ADDQ AX, BX + ADCQ DX, R11 + ADDQ R8, BX + MOVQ 24(SP), CX + + // | w22 @ CX + ADCQ R11, CX + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 40(SP) | 18 SI | 19 DI | 20 R15 | 21 BX | 22 CX | 23 16(SP) | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w18 @ SI + MOVQ 64(R10), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 24(SP) + + // | j9 + + // | w19 @ DI + MOVQ 72(R10), AX + MULQ 88(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w20 @ R15 + MOVQ 80(R10), AX + MULQ 88(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w21 @ BX + MOVQ 88(R10), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w22 @ CX + MOVQ 96(R10), AX + MULQ 88(SP) + ADDQ AX, CX + ADCQ DX, R11 + ADDQ R8, CX + MOVQ 16(SP), SI + + // | w23 @ SI + ADCQ R11, SI + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 40(SP) | 18 24(SP) | 19 DI | 20 R15 | 21 BX | 22 CX | 23 SI | 24 8(SP) | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w19 @ DI + MOVQ 64(R10), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R8 + MOVQ DI, 16(SP) + + // | j9 + + // | w20 @ R15 + MOVQ 72(R10), AX + MULQ 80(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R8, R15 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w21 @ BX + MOVQ 80(R10), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w22 @ CX + MOVQ 88(R10), AX + MULQ 80(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w23 @ SI + MOVQ 96(R10), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, R11 + ADDQ R8, SI + MOVQ 8(SP), DI + + // | w24 @ DI + ADCQ R11, DI + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 40(SP) | 18 24(SP) | 19 16(SP) | 20 R15 | 21 BX | 22 CX | 23 SI | 24 DI | 25 (SP) + + + MOVQ $0x00, R8 + + // | + +/* */ + + // | j8 + + // | w20 @ R15 + MOVQ 64(R10), AX + MULQ 72(SP) + ADDQ AX, R15 + ADCQ DX, R8 + + // | j9 + + // | w21 @ BX + MOVQ 72(R10), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w22 @ CX + MOVQ 80(R10), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w23 @ SI + MOVQ 88(R10), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w24 @ DI + MOVQ 96(R10), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ DX, R11 + ADDQ R8, DI + + // | very last limb goes to short carry register + MOVQ (SP), R8 + + // | w-1 @ R8 + ADCQ R11, R8 + MOVQ $0x00, R11 + ADCQ $0x00, R11 + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - + // | 13 R14 | 14 R13 | 15 R12 | 16 R9 | 17 40(SP) | 18 24(SP) | 19 16(SP) | 20 R15 | 21 BX | 22 CX | 23 SI | 24 DI | 25 R8 + + + // | + +/* modular reduction */ + + MOVQ R14, DX + SUBQ (R10), DX + MOVQ DX, (SP) + MOVQ R13, DX + 
SBBQ 8(R10), DX + MOVQ DX, 8(SP) + MOVQ R12, DX + SBBQ 16(R10), DX + MOVQ DX, 112(SP) + MOVQ R9, DX + SBBQ 24(R10), DX + MOVQ DX, 120(SP) + MOVQ 40(SP), DX + SBBQ 32(R10), DX + MOVQ DX, 128(SP) + MOVQ 24(SP), DX + SBBQ 40(R10), DX + MOVQ DX, 144(SP) + MOVQ 16(SP), DX + SBBQ 48(R10), DX + MOVQ DX, 152(SP) + MOVQ R15, DX + SBBQ 56(R10), DX + MOVQ DX, 160(SP) + MOVQ BX, DX + SBBQ 64(R10), DX + MOVQ DX, 168(SP) + MOVQ CX, DX + SBBQ 72(R10), DX + MOVQ DX, 176(SP) + MOVQ SI, DX + SBBQ 80(R10), DX + MOVQ DX, 184(SP) + MOVQ DI, DX + SBBQ 88(R10), DX + MOVQ DX, 192(SP) + MOVQ R8, DX + SBBQ 96(R10), DX + MOVQ DX, 200(SP) + SBBQ $0x00, R11 + + // | + +/* out */ + + MOVQ c+0(FP), R11 + CMOVQCC (SP), R14 + MOVQ R14, (R11) + CMOVQCC 8(SP), R13 + MOVQ R13, 8(R11) + CMOVQCC 112(SP), R12 + MOVQ R12, 16(R11) + CMOVQCC 120(SP), R9 + MOVQ R9, 24(R11) + MOVQ 40(SP), DX + CMOVQCC 128(SP), DX + MOVQ DX, 32(R11) + MOVQ 24(SP), DX + CMOVQCC 144(SP), DX + MOVQ DX, 40(R11) + MOVQ 16(SP), DX + CMOVQCC 152(SP), DX + MOVQ DX, 48(R11) + CMOVQCC 160(SP), R15 + MOVQ R15, 56(R11) + CMOVQCC 168(SP), BX + MOVQ BX, 64(R11) + CMOVQCC 176(SP), CX + MOVQ CX, 72(R11) + CMOVQCC 184(SP), SI + MOVQ SI, 80(R11) + CMOVQCC 192(SP), DI + MOVQ DI, 88(R11) + CMOVQCC 200(SP), R8 + MOVQ R8, 96(R11) + RET + + // | + +/* end */ + + +// func cpy14(dst *[14]uint64, src *[14]uint64) +TEXT ·cpy14(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + MOVQ 88(SI), R8 + MOVQ R8, 88(DI) + MOVQ 96(SI), R8 + MOVQ R8, 96(DI) + MOVQ 104(SI), R8 + MOVQ R8, 104(DI) + RET + +// func eq14(a *[14]uint64, b *[14]uint64) bool +TEXT ·eq14(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JNE ret + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JNE ret + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JNE ret + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp14(a *[14]uint64, b *[14]uint64) int8 +TEXT ·cmp14(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JB gt + JA lt + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JB gt + JA lt + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JB gt + JA lt + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt 
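+ // | last limb: if this compare is still equal, fall through and return 0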
+ MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64) +TEXT ·add14(SB), NOSPLIT, $144-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 32(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 40(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 48(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 56(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 64(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 72(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 80(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 88(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 96(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 104(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 112(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 120(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 128(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 136(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 32(SP), CX + MOVQ CX, (DI) + CMOVQCC 40(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 48(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 56(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 64(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 72(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 80(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 88(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 96(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 104(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 112(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 120(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 128(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + CMOVQCC 136(SP), BX + MOVQ BX, 104(DI) + RET + + // | + +/* end */ + + RET + +// func addn14(a *[14]uint64, b *[14]uint64) uint64 +TEXT ·addn14(SB), NOSPLIT, $32-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// 
func double14(c *[14]uint64, a *[14]uint64, p *[14]uint64) +TEXT ·double14(SB), NOSPLIT, $144-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ BX, BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ BX, BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ BX, BX + MOVQ BX, 24(SP) + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 32(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 40(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 48(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 56(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 64(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 72(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 80(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 88(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 96(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 104(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 112(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 120(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 128(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 136(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 32(SP), CX + MOVQ CX, (DI) + CMOVQCC 40(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 48(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 56(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 64(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 72(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 80(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 88(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 96(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 104(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 112(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 120(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 128(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + CMOVQCC 136(SP), BX + MOVQ BX, 104(DI) + RET + + // | + +/* end */ + + RET + +// func sub14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64) +TEXT ·sub14(SB), NOSPLIT, $144-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 32(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 40(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 80(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 88(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 96(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 104(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 112(SP) + CMOVQCS 88(SI), AX + MOVQ AX, 120(SP) + CMOVQCS 96(SI), AX + MOVQ AX, 128(SP) + CMOVQCS 104(SI), AX + MOVQ AX, 136(SP) + + // | + MOVQ c+0(FP), DI + ADDQ 32(SP), CX + MOVQ CX, (DI) + ADCQ 
40(SP), DX + MOVQ DX, 8(DI) + ADCQ 48(SP), R8 + MOVQ R8, 16(DI) + ADCQ 56(SP), R9 + MOVQ R9, 24(DI) + ADCQ 64(SP), R10 + MOVQ R10, 32(DI) + ADCQ 72(SP), R11 + MOVQ R11, 40(DI) + ADCQ 80(SP), R12 + MOVQ R12, 48(DI) + ADCQ 88(SP), R13 + MOVQ R13, 56(DI) + ADCQ 96(SP), R14 + MOVQ R14, 64(DI) + ADCQ 104(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 112(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + ADCQ 120(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + ADCQ 128(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + ADCQ 136(SP), BX + MOVQ BX, 104(DI) + RET + + // | + +/* end */ + + RET + +// func subn14(a *[14]uint64, b *[14]uint64) uint64 +TEXT ·subn14(SB), NOSPLIT, $32-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg14(c *[14]uint64, a *[14]uint64, p *[14]uint64) +TEXT ·_neg14(SB), NOSPLIT, $32-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + MOVQ 88(SI), BX + SBBQ 88(DI), BX + MOVQ BX, 8(SP) + MOVQ 96(SI), BX + SBBQ 96(DI), BX + MOVQ BX, 16(SP) + MOVQ 104(SI), BX + SBBQ 104(DI), BX + MOVQ BX, 24(SP) + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + RET + + // | + +/* end */ + + RET + +// func mul_two_14(a *[14]uint64) +TEXT ·mul_two_14(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RCLQ $0x01, 88(DI) + RCLQ $0x01, 96(DI) + RCLQ $0x01, 104(DI) + RET + +// func div_two_14(a *[14]uint64) +TEXT ·div_two_14(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 104(DI) + RCRQ $0x01, 96(DI) + RCRQ $0x01, 88(DI) + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 
40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64, inp uint64) +TEXT ·mul14(SB), NOSPLIT, $256-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, 
R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ 
(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX + + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX + + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ AX, AX + + // | a11 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 88(SP) + MOVQ $0x00, R8 + + // | a11 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a11 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a11 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a11 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a11 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ AX, AX + + // | a12 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 96(SP) + MOVQ $0x00, R9 + + // | a12 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a12 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a12 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a12 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a12 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a12 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 13 */ + + // | a13 @ DX + MOVQ 104(DI), DX + XORQ AX, AX + + // | a13 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 104(SP) + MOVQ $0x00, R10 + + // | a13 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a13 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a13 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a13 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a13 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a13 * b6 + 
MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a13 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a13 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ BX, R10 + ADCQ $0x00, R10 + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 R11 | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 - | 24 - | 25 - | 26 - | 27 - + + + MOVQ R11, 112(SP) + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + MOVQ R8, 160(SP) + MOVQ R9, 168(SP) + MOVQ R10, 176(SP) + + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 - | 24 - | 25 - | 26 - | 27 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 184(SP) + + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b11 + MULXQ 88(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b12 + MULXQ 96(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b13 + MULXQ 104(SI), AX, R11 + ADCXQ AX, R10 + ADCQ $0x00, R11 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R12, R12 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 192(SP) + + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R13, R13 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 200(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R14, R14 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 208(SP) + + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R15, R15 + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 216(SP) + + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ CX, CX + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 224(SP) + + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ 
AX, R12 + ADCXQ BX, R13 + + // | a5 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R8, R8 + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 232(SP) + + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R9, R9 + + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 240(SP) + + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R10, R10 + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 248(SP) + + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R11, R11 + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ R12, R12 + + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ R13, R13 + + // | a11 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a11 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ R14, R14 + + // | a12 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a12 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a12 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 13 */ + + // | a13 @ DX + MOVQ 
104(DI), DX + XORQ DI, DI + + // | a13 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a13 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a13 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a13 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a13 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADOXQ BX, DI + ADCQ $0x00, DI + + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 - | 24 - | 25 - | 26 - | 27 - + + + MOVQ 72(SP), AX + ADDQ AX, 184(SP) + MOVQ 80(SP), AX + ADCQ AX, 192(SP) + MOVQ 88(SP), AX + ADCQ AX, 200(SP) + MOVQ 96(SP), AX + ADCQ AX, 208(SP) + MOVQ 104(SP), AX + ADCQ AX, 216(SP) + MOVQ 112(SP), AX + ADCQ AX, 224(SP) + MOVQ 120(SP), AX + ADCQ AX, 232(SP) + MOVQ 128(SP), AX + ADCQ AX, 240(SP) + MOVQ 136(SP), AX + ADCQ AX, 248(SP) + ADCQ 144(SP), R15 + ADCQ 152(SP), CX + ADCQ 160(SP), R8 + ADCQ 168(SP), R9 + ADCQ 176(SP), R10 + ADCQ $0x00, R11 + ADCQ $0x00, R12 + ADCQ $0x00, R13 + ADCQ $0x00, R14 + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R14, 8(SP) + MOVQ 24(SP), R14 + MOVQ R13, 16(SP) + MOVQ 32(SP), R13 + MOVQ R12, 24(SP) + MOVQ 40(SP), R12 + MOVQ R11, 32(SP) + MOVQ 48(SP), R11 + MOVQ R10, 40(SP) + MOVQ 56(SP), R10 + MOVQ R9, 48(SP) + MOVQ 64(SP), R9 + MOVQ R8, 56(SP) + MOVQ 184(SP), R8 + MOVQ CX, 64(SP) + MOVQ R15, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), CX + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R14 | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R14 | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R15 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j3 + + // | w3 @ R14 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ 
R15, R13 + + // | j4 + + // | w4 @ R13 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j5 + + // | w5 @ R12 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j6 + + // | w6 @ R11 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j7 + + // | w7 @ R10 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j8 + + // | w8 @ R9 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + ADOXQ BX, R8 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R14 | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R15 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j2 + + // | w3 @ R14 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j3 + + // | w4 @ R13 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j4 + + // | w5 @ R12 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j5 + + // | w6 @ R11 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j6 + + // | w7 @ R10 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j7 + + // | w8 @ R9 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j8 + + // | w9 @ R8 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R8 + + // | w10 @ 192(SP) + // | move to temp register + MOVQ 192(SP), AX + ADCXQ R15, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R14 | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R15 + + // | save u2 + MOVQ DX, 96(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j1 + + // | w3 @ R14 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j2 + + // | w4 @ R13 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j3 + + // | w5 @ R12 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j4 + + // | w6 @ R11 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j5 + + // | w7 @ R10 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j6 + + // | w8 @ R9 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j7 + + // | w9 @ R8 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(CX), AX, R15 + ADOXQ AX, BX + + // | w11 @ 200(SP) + // | move to temp register + MOVQ 200(SP), AX + ADCXQ R15, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 R14 | 4 R13 | 5 
R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u3 = w3 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R15 + + // | save u3 + MOVQ DX, 104(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ R14 + MULXQ (CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j1 + + // | w4 @ R13 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j2 + + // | w5 @ R12 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j3 + + // | w6 @ R11 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j4 + + // | w7 @ R10 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j5 + + // | w8 @ R9 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j6 + + // | w9 @ R8 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(CX), AX, R15 + ADOXQ AX, SI + + // | w12 @ 208(SP) + // | move to temp register + MOVQ 208(SP), AX + ADCXQ R15, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 DI | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u4 = w4 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R15 + + // | save u4 + MOVQ DX, 112(SP) + + // | + +/* */ + + // | j0 + + // | w4 @ R13 + MULXQ (CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j1 + + // | w5 @ R12 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j2 + + // | w6 @ R11 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j3 + + // | w7 @ R10 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j4 + + // | w8 @ R9 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j5 + + // | w9 @ R8 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(CX), AX, R15 + ADOXQ AX, DI + + // | w13 @ 216(SP) + // | move to temp register + MOVQ 216(SP), AX + ADCXQ R15, AX + ADOXQ R14, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u5 = w5 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R15 + + // | save u5 + MOVQ DX, 120(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R12 + MULXQ (CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j1 + + // | w6 @ R11 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j2 + + // | w7 @ R10 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j3 + + 
// | w8 @ R9 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j4 + + // | w9 @ R8 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j5 + + // | w10 @ BX + MULXQ 40(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j8 + + // | w13 @ R14 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R14 + + // | w14 @ 224(SP) + // | move to temp register + MOVQ 224(SP), AX + ADCXQ R15, AX + ADOXQ R13, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R13 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R11 | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u6 = w6 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R15 + + // | save u6 + MOVQ DX, 128(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ R11 + MULXQ (CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j1 + + // | w7 @ R10 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j2 + + // | w8 @ R9 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j3 + + // | w9 @ R8 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j7 + + // | w13 @ R14 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j8 + + // | w14 @ R13 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R13 + + // | w15 @ 232(SP) + // | move to temp register + MOVQ 232(SP), AX + ADCXQ R15, AX + ADOXQ R12, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R12 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R10 | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 240(SP) | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u7 = w7 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R15 + + // | save u7 + MOVQ DX, 136(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R10 + MULXQ (CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, R9 + + // | j1 + + // | w8 @ R9 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j2 + + // | w9 @ R8 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j6 + + // | w13 @ R14 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j7 + + // | w14 @ R13 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j8 + + // | w15 @ R12 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R12 + + // | w16 @ 240(SP) + // | move to temp register + MOVQ 240(SP), AX + ADCXQ R15, AX + ADOXQ R11, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R11 + ADCXQ R10, R10 + 
MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 248(SP) | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u8 = w8 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R15 + + // | save u8 + MOVQ DX, 144(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ R9 + MULXQ (CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R8 + + // | j1 + + // | w9 @ R8 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j5 + + // | w13 @ R14 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j6 + + // | w14 @ R13 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j7 + + // | w15 @ R12 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j8 + + // | w16 @ R11 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R11 + + // | w17 @ 248(SP) + // | move to temp register + MOVQ 248(SP), AX + ADCXQ R15, AX + ADOXQ R10, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R10 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | carry from q1 should be added to w18 + MOVQ R9, 152(SP) + + // | + +/* montgomerry reduction q2 */ + + // | clear flags + XORQ R9, R9 + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w9 @ R8 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, BX + + // | j10 + + // | w10 @ BX + MULXQ 80(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, SI + + // | j11 + + // | w11 @ SI + MULXQ 88(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j12 + + // | w12 @ DI + MULXQ 96(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j13 + + // | w13 @ R14 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + ADOXQ R9, R13 + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 BX | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u1 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w10 @ BX + MULXQ 72(CX), AX, R15 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R15, SI + + // | j10 + + // | w11 @ SI + MULXQ 80(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j11 + + // | w12 @ DI + MULXQ 88(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j12 + + // | w13 @ R14 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j13 
+ + // | w14 @ R13 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + ADOXQ R9, R12 + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 SI | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u2 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w11 @ SI + MULXQ 72(CX), AX, R15 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R15, DI + + // | j10 + + // | w12 @ DI + MULXQ 80(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R14 + + // | j11 + + // | w13 @ R14 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j12 + + // | w14 @ R13 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j13 + + // | w15 @ R12 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + ADOXQ R9, R11 + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 DI | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u3 @ 104(SP) + MOVQ 104(SP), DX + + // | + +/* */ + + // | j9 + + // | w12 @ DI + MULXQ 72(CX), AX, R15 + ADOXQ AX, DI + MOVQ DI, 96(SP) + ADCXQ R15, R14 + + // | j10 + + // | w13 @ R14 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j11 + + // | w14 @ R13 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j12 + + // | w15 @ R12 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j13 + + // | w16 @ R11 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + ADOXQ R9, R10 + + // | bring the carry from q1 + MOVQ 152(SP), R9 + MOVQ $0x00, AX + ADCXQ AX, R9 + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R14 + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 72(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u4 @ 112(SP) + MOVQ 112(SP), DX + + // | + +/* */ + + // | j9 + + // | w13 @ R14 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R14 + MOVQ R14, 104(SP) + ADCXQ R15, R13 + + // | j10 + + // | w14 @ R13 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, R12 + + // | j11 + + // | w15 @ R12 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j12 + + // | w16 @ R11 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j13 + + // | w17 @ R10 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R10 + + // | w18 @ 72(SP) + // | move to an idle register + MOVQ 72(SP), BX + + // | w18 @ BX + ADCXQ R15, BX + ADOXQ R9, BX + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 R13 | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u5 @ 120(SP) + MOVQ 120(SP), DX + + // | + +/* */ + + // | j9 + + // | w14 @ R13 + 
MULXQ 72(CX), AX, R15 + ADOXQ AX, R13 + MOVQ R13, 72(SP) + ADCXQ R15, R12 + + // | j10 + + // | w15 @ R12 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j11 + + // | w16 @ R11 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j12 + + // | w17 @ R10 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j13 + + // | w18 @ BX + MULXQ 104(CX), AX, R15 + ADOXQ AX, BX + + // | w19 @ 64(SP) + // | move to an idle register + MOVQ 64(SP), DI + + // | w19 @ DI + ADCXQ R15, DI + ADOXQ R9, DI + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 72(SP) | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 DI | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u6 @ 128(SP) + MOVQ 128(SP), DX + + // | + +/* */ + + // | j9 + + // | w15 @ R12 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j10 + + // | w16 @ R11 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j11 + + // | w17 @ R10 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j12 + + // | w18 @ BX + MULXQ 96(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, DI + + // | j13 + + // | w19 @ DI + MULXQ 104(CX), AX, R15 + ADOXQ AX, DI + + // | w20 @ 56(SP) + // | move to an idle register + MOVQ 56(SP), SI + + // | w20 @ SI + ADCXQ R15, SI + ADOXQ R9, SI + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 72(SP) | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 DI | 20 SI | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u7 @ 136(SP) + MOVQ 136(SP), DX + + // | + +/* */ + + // | j9 + + // | w16 @ R11 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j10 + + // | w17 @ R10 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j11 + + // | w18 @ BX + MULXQ 88(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, DI + + // | j12 + + // | w19 @ DI + MULXQ 96(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, SI + + // | j13 + + // | w20 @ SI + MULXQ 104(CX), AX, R15 + ADOXQ AX, SI + + // | w21 @ 48(SP) + // | move to an idle register + MOVQ 48(SP), R13 + + // | w21 @ R13 + ADCXQ R15, R13 + ADOXQ R9, R13 + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 72(SP) | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 DI | 20 SI | 21 R13 | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u8 @ 144(SP) + MOVQ 144(SP), DX + + // | + +/* */ + + // | j9 + + // | w17 @ R10 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j10 + + // | w18 @ BX + MULXQ 80(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, DI + + // | j11 + + // | w19 @ DI + MULXQ 88(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, SI + + // | j12 + + // | w20 @ SI + MULXQ 96(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, R13 + + // | j13 + + // | w21 @ R13 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R13 + + // | w22 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), R14 + + // | w22 @ R14 + ADCXQ R15, R14 + ADOXQ 
R9, R14 + MOVQ $0x00, R9 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 72(SP) | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 DI | 20 SI | 21 R13 | 22 R14 | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | save the carry from q2 + // | should be added to w23 + MOVQ R9, 152(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 80(SP), R9 + MOVQ R14, 40(SP) + MOVQ 88(SP), R14 + MOVQ R13, 48(SP) + MOVQ 96(SP), R13 + MOVQ SI, 56(SP) + MOVQ 104(SP), SI + MOVQ DI, 64(SP) + MOVQ 72(SP), DI + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 R9 | 11 R14 | 12 R13 | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R8 | 10 R9 | 11 R14 | 12 R13 | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u9 = w9 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, R15 + + // | save u9 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w9 @ R8 + MULXQ (CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j1 + + // | w10 @ R9 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j2 + + // | w11 @ R14 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j3 + + // | w12 @ R13 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, SI + + // | j4 + + // | w13 @ SI + MULXQ 32(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j5 + + // | w14 @ DI + MULXQ 40(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R12 + + // | j6 + + // | w15 @ R12 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j7 + + // | w16 @ R11 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j8 + + // | w17 @ R10 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + ADOXQ R8, BX + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R9 | 11 R14 | 12 R13 | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u10 = w10 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R15 + + // | save u10 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w10 @ R9 + MULXQ (CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j1 + + // | w11 @ R14 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j2 + + // | w12 @ R13 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, SI + + // | j3 + + // | w13 @ SI + MULXQ 24(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j4 + + // | w14 @ DI + MULXQ 32(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R12 + + // | j5 + + // | w15 @ R12 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j6 + + // | w16 @ R11 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j7 + + // | w17 @ R10 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j8 + + // | w18 @ BX + MULXQ 64(CX), AX, R15 + ADOXQ AX, BX + + // | w19 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + 
ADCXQ R15, AX + ADOXQ R8, AX + + // | move to an idle register + // | w19 @ AX + MOVQ AX, R8 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 R14 | 12 R13 | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 R8 | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u11 = w11 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R15 + + // | save u11 + MOVQ DX, 64(SP) + + // | + +/* */ + + // | j0 + + // | w11 @ R14 + MULXQ (CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j1 + + // | w12 @ R13 + MULXQ 8(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, SI + + // | j2 + + // | w13 @ SI + MULXQ 16(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j3 + + // | w14 @ DI + MULXQ 24(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R12 + + // | j4 + + // | w15 @ R12 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j5 + + // | w16 @ R11 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j6 + + // | w17 @ R10 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j7 + + // | w18 @ BX + MULXQ 56(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j8 + + // | w19 @ R8 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R8 + + // | w20 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R15, AX + ADOXQ R9, AX + + // | move to an idle register + // | w20 @ AX + MOVQ AX, R9 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 12 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 R13 | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 R8 | 20 R9 | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u12 = w12 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R15 + + // | save u12 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w12 @ R13 + MULXQ (CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, SI + + // | j1 + + // | w13 @ SI + MULXQ 8(CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j2 + + // | w14 @ DI + MULXQ 16(CX), AX, R15 + ADOXQ AX, DI + ADCXQ R15, R12 + + // | j3 + + // | w15 @ R12 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j4 + + // | w16 @ R11 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j5 + + // | w17 @ R10 + MULXQ 40(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j6 + + // | w18 @ BX + MULXQ 48(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j7 + + // | w19 @ R8 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j8 + + // | w20 @ R9 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R9 + + // | w21 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R15, AX + ADOXQ R14, AX + + // | move to an idle register + // | w21 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 13 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 SI + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 R8 | 20 R9 | 21 R14 | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u13 = w13 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R15 + + // | save u13 + MOVQ DX, 48(SP) + + // | + +/* */ + + // | j0 + + // | w13 @ SI + MULXQ (CX), AX, R15 + ADOXQ AX, SI + ADCXQ R15, DI + + // | j1 + + // | w14 @ DI + MULXQ 8(CX), AX, 
R15 + ADOXQ AX, DI + ADCXQ R15, R12 + + // | j2 + + // | w15 @ R12 + MULXQ 16(CX), AX, R15 + ADOXQ AX, R12 + ADCXQ R15, R11 + + // | j3 + + // | w16 @ R11 + MULXQ 24(CX), AX, R15 + ADOXQ AX, R11 + ADCXQ R15, R10 + + // | j4 + + // | w17 @ R10 + MULXQ 32(CX), AX, R15 + ADOXQ AX, R10 + ADCXQ R15, BX + + // | j5 + + // | w18 @ BX + MULXQ 40(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j6 + + // | w19 @ R8 + MULXQ 48(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j7 + + // | w20 @ R9 + MULXQ 56(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j8 + + // | w21 @ R14 + MULXQ 64(CX), AX, R15 + ADOXQ AX, R14 + + // | w22 @ 40(SP) + // | move to temp register + MOVQ 40(SP), AX + ADCXQ R15, AX + ADOXQ R13, AX + + // | move to an idle register + // | w22 @ AX + MOVQ AX, R13 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 R8 | 20 R9 | 21 R14 | 22 R13 | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w23 + ADCQ 152(SP), SI + + // | + +/* montgomerry reduction q4 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 BX | 19 R8 | 20 R9 | 21 R14 | 22 R13 | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u0 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w18 @ BX + MULXQ 72(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + MOVQ BX, 40(SP) + + // | j10 + + // | w19 @ R8 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j11 + + // | w20 @ R9 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j12 + + // | w21 @ R14 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j13 + + // | w22 @ R13 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R13 + + // | w23 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), BX + ADCXQ R15, BX + + // | bring carry from q2 & q3 + // | w23 @ BX + ADOXQ SI, BX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R15 + ADOXQ R15, SI + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 40(SP) | 19 R8 | 20 R9 | 21 R14 | 22 R13 | 23 BX | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u1 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w19 @ R8 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + MOVQ R8, 32(SP) + + // | j10 + + // | w20 @ R9 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j11 + + // | w21 @ R14 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j12 + + // | w22 @ R13 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, BX + + // | j13 + + // | w23 @ BX + MULXQ 104(CX), AX, R15 + ADOXQ AX, BX + + // | w24 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), R8 + ADCXQ R15, R8 + + // | w24 @ R8 + ADOXQ SI, R8 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R15 + ADOXQ R15, SI + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 40(SP) | 19 32(SP) | 20 R9 | 21 R14 | 22 R13 | 23 BX | 24 R8 | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | u2 @ 64(SP) + MOVQ 64(SP), DX + + // | + +/* */ + + // | j9 
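+	// | note: q4 replays the u values saved during q3 against the remaining
+	// | modulus limbs (j9..j13); u2 here is the value saved while reducing
+	// | w11, applied starting at w20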
+ + // | w20 @ R9 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + MOVQ R9, 24(SP) + + // | j10 + + // | w21 @ R14 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + + // | j11 + + // | w22 @ R13 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, BX + + // | j12 + + // | w23 @ BX + MULXQ 96(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j13 + + // | w24 @ R8 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R8 + + // | w25 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), R9 + ADCXQ R15, R9 + + // | w25 @ R9 + ADOXQ SI, R9 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R15 + ADOXQ R15, SI + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 R14 | 22 R13 | 23 BX | 24 R8 | 25 R9 | 26 8(SP) | 27 (SP) + + + // | u3 @ 56(SP) + MOVQ 56(SP), DX + + // | + +/* */ + + // | j9 + + // | w21 @ R14 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R14 + ADCXQ R15, R13 + MOVQ R14, 16(SP) + + // | j10 + + // | w22 @ R13 + MULXQ 80(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, BX + + // | j11 + + // | w23 @ BX + MULXQ 88(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j12 + + // | w24 @ R8 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j13 + + // | w25 @ R9 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R9 + + // | w26 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), R14 + ADCXQ R15, R14 + + // | w26 @ R14 + ADOXQ SI, R14 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R15 + ADOXQ R15, SI + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 R13 | 23 BX | 24 R8 | 25 R9 | 26 R14 | 27 (SP) + + + // | u4 @ 48(SP) + MOVQ 48(SP), DX + + // | + +/* */ + + // | j9 + + // | w22 @ R13 + MULXQ 72(CX), AX, R15 + ADOXQ AX, R13 + ADCXQ R15, BX + + // | j10 + + // | w23 @ BX + MULXQ 80(CX), AX, R15 + ADOXQ AX, BX + ADCXQ R15, R8 + + // | j11 + + // | w24 @ R8 + MULXQ 88(CX), AX, R15 + ADOXQ AX, R8 + ADCXQ R15, R9 + + // | j12 + + // | w25 @ R9 + MULXQ 96(CX), AX, R15 + ADOXQ AX, R9 + ADCXQ R15, R14 + + // | j13 + + // | w26 @ R14 + MULXQ 104(CX), AX, R15 + ADOXQ AX, R14 + + // | w27 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R15, AX + + // | w27 @ AX + ADOXQ SI, AX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R15 + ADOXQ R15, SI + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 DI | 15 R12 | 16 R11 | 17 R10 | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 R13 | 23 BX | 24 R8 | 25 R9 | 26 R14 | 27 AX + + + // | + +/* modular reduction */ + + MOVQ DI, R15 + SUBQ (CX), R15 + MOVQ R12, DX + SBBQ 8(CX), DX + MOVQ DX, (SP) + MOVQ R11, DX + SBBQ 16(CX), DX + MOVQ DX, 8(SP) + MOVQ R10, DX + SBBQ 24(CX), DX + MOVQ DX, 48(SP) + MOVQ 40(SP), DX + SBBQ 32(CX), DX + MOVQ DX, 56(SP) + MOVQ 32(SP), DX + SBBQ 40(CX), DX + MOVQ DX, 64(SP) + MOVQ 24(SP), DX + SBBQ 48(CX), DX + MOVQ DX, 72(SP) + MOVQ 16(SP), DX + SBBQ 56(CX), DX + MOVQ DX, 80(SP) + MOVQ R13, DX + SBBQ 64(CX), DX + MOVQ DX, 88(SP) + MOVQ BX, DX + SBBQ 72(CX), DX + MOVQ DX, 96(SP) + MOVQ R8, DX + SBBQ 80(CX), DX + MOVQ DX, 104(SP) + MOVQ R9, DX + SBBQ 88(CX), DX + MOVQ DX, 112(SP) + MOVQ R14, DX + SBBQ 96(CX), DX + MOVQ DX, 120(SP) + MOVQ AX, DX + SBBQ 104(CX), DX + MOVQ DX, 128(SP) + SBBQ $0x00, SI + + // | + +/* out */ + + MOVQ 
c+0(FP), SI + CMOVQCC R15, DI + MOVQ DI, (SI) + CMOVQCC (SP), R12 + MOVQ R12, 8(SI) + CMOVQCC 8(SP), R11 + MOVQ R11, 16(SI) + CMOVQCC 48(SP), R10 + MOVQ R10, 24(SI) + MOVQ 40(SP), DX + CMOVQCC 56(SP), DX + MOVQ DX, 32(SI) + MOVQ 32(SP), DX + CMOVQCC 64(SP), DX + MOVQ DX, 40(SI) + MOVQ 24(SP), DX + CMOVQCC 72(SP), DX + MOVQ DX, 48(SI) + MOVQ 16(SP), DX + CMOVQCC 80(SP), DX + MOVQ DX, 56(SI) + CMOVQCC 88(SP), R13 + MOVQ R13, 64(SI) + CMOVQCC 96(SP), BX + MOVQ BX, 72(SI) + CMOVQCC 104(SP), R8 + MOVQ R8, 80(SI) + CMOVQCC 112(SP), R9 + MOVQ R9, 88(SI) + CMOVQCC 120(SP), R14 + MOVQ R14, 96(SI) + CMOVQCC 128(SP), AX + MOVQ AX, 104(SI) + RET + + // | + +/* end */ + + +// func mul_no_adx_bmi2_14(c *[14]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_14(SB), NOSPLIT, $264-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX + MULQ CX + MOVQ AX, (SP) + MOVQ DX, R8 + + // | a0 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | a0 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | a0 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 + + // | a1 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a1 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 + + // | a2 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ 
DX, R8 + ADCQ BX, R9 + + // | a2 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 + + // | a3 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a3 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 + + // | a4 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a4 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 + + // | a5 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a5 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 + + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, 
BX + + // | a6 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a6 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 + + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a7 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a8 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 + + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a9 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 
10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 80(SP) + MOVQ $0x00, R9 + + // | a10 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a10 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 88(SP) + MOVQ $0x00, R10 + + // | a11 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a11 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 96(SP) + MOVQ $0x00, R11 + + // | a12 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a12 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 13 */ + + // | a13 @ CX + MOVQ 104(DI), CX + MOVQ $0x00, BX + + // | a13 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + + // | a13 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ 
$0x00, BX + + // | a13 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, BX + + // | a13 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R12 + // | 14 R13 | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 BX | 22 - | 23 - | 24 - | 25 - | 26 - | 27 - + + + MOVQ R12, 104(SP) + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ R9, 144(SP) + MOVQ R10, 152(SP) + MOVQ R11, 160(SP) + MOVQ BX, 168(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 - | 23 - | 24 - | 25 - | 26 - | 27 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX + MULQ CX + MOVQ AX, 176(SP) + MOVQ DX, R8 + + // | a0 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | a0 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | a0 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | a0 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | a0 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 184(SP) + MOVQ $0x00, R8 + + // | a1 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a1 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a1 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 192(SP) + MOVQ $0x00, R9 + + // | a2 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a2 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a2 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b8 + MOVQ 
64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 200(SP) + MOVQ $0x00, R10 + + // | a3 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a3 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a3 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 208(SP) + MOVQ $0x00, R11 + + // | a4 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a4 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a4 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 216(SP) + MOVQ $0x00, R12 + + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a5 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + + // | a5 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 224(SP) + MOVQ $0x00, R13 + + // | a6 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a6 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + + // | a6 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 232(SP) + MOVQ $0x00, R14 + + // | a7 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a7 * b12 
+ MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + + // | a7 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 240(SP) + MOVQ $0x00, R15 + + // | a8 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + + // | a8 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 248(SP) + MOVQ $0x00, R8 + + // | a9 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a9 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + + // | a9 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 256(SP) + MOVQ $0x00, R9 + + // | a10 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a10 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + + // | a10 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + + // | a11 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a11 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + + // | a11 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + + // | + +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + + // | a12 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + 
ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + + // | a12 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + + // | + +/* i = 13 */ + + // | a13 @ CX + MOVQ 104(DI), CX + MOVQ $0x00, BX + + // | a13 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + + // | a13 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a13 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, BX + + // | a13 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, BX + + // | + +/* */ + + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 176(SP) | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) + // | 14 112(SP) | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 - | 23 - | 24 - | 25 - | 26 - | 27 - + + + MOVQ 64(SP), AX + ADDQ AX, 176(SP) + MOVQ 72(SP), AX + ADCQ AX, 184(SP) + MOVQ 80(SP), AX + ADCQ AX, 192(SP) + MOVQ 88(SP), AX + ADCQ AX, 200(SP) + MOVQ 96(SP), AX + ADCQ AX, 208(SP) + MOVQ 104(SP), AX + ADCQ AX, 216(SP) + MOVQ 112(SP), AX + ADCQ AX, 224(SP) + MOVQ 120(SP), AX + ADCQ AX, 232(SP) + MOVQ 128(SP), AX + ADCQ AX, 240(SP) + MOVQ 136(SP), AX + ADCQ AX, 248(SP) + MOVQ 144(SP), AX + ADCQ AX, 256(SP) + ADCQ 152(SP), R10 + ADCQ 160(SP), R11 + ADCQ 168(SP), R12 + ADCQ $0x00, R13 + ADCQ $0x00, R14 + ADCQ $0x00, R15 + ADCQ $0x00, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 176(SP) | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R9, 8(SP) + MOVQ 32(SP), R9 + MOVQ R8, 16(SP) + MOVQ 40(SP), R8 + MOVQ R15, 24(SP) + MOVQ 48(SP), R15 + MOVQ R14, 32(SP) + MOVQ 56(SP), R14 + MOVQ R13, 40(SP) + MOVQ 176(SP), R13 + MOVQ R12, 48(SP) + MOVQ R11, 56(SP) + MOVQ R10, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R12 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + 
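+	// | note: without ADX/BMI2 only the single flags-based carry chain is
+	// | available, so each step below uses MULQ with ADDQ/ADCQ and keeps the
+	// | running carry in R10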
+/* */ + + // | save u0 + MOVQ R11, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R12), AX + MULQ R11 + ADDQ AX, CX + ADCQ DX, R10 + + // | j1 + + // | w1 @ DI + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w2 @ SI + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w3 @ BX + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w4 @ R9 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w5 @ R8 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w6 @ R15 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w7 @ R14 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + + // | w8 @ R13 + ADCQ DX, R13 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 184(SP) | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u1 + MOVQ R11, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ DX, R10 + + // | j1 + + // | w2 @ SI + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w3 @ BX + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w4 @ R9 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w5 @ R8 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w6 @ R15 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w7 @ R14 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w8 @ R13 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R10, R13 + + // | move to idle register + MOVQ 184(SP), DI + + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI | 10 192(SP) | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u2 + MOVQ R11, 88(SP) + + // | j0 + + // | w2 @ SI + MOVQ (R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ DX, R10 + + // | j1 + + // | w3 @ BX + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w4 @ R9 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, 
DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w5 @ R8 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w6 @ R15 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w7 @ R14 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w8 @ R13 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w9 @ DI + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R10, DI + + // | move to idle register + MOVQ 192(SP), SI + + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI | 10 SI | 11 200(SP) | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u3 + MOVQ R11, 96(SP) + + // | j0 + + // | w3 @ BX + MOVQ (R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ DX, R10 + + // | j1 + + // | w4 @ R9 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w5 @ R8 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w6 @ R15 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w7 @ R14 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w8 @ R13 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w9 @ DI + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w10 @ SI + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R10, SI + + // | move to idle register + MOVQ 200(SP), BX + + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R9 | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI | 10 SI | 11 BX | 12 208(SP) | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u4 = w4 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u4 + MOVQ R11, 104(SP) + + // | j0 + + // | w4 @ R9 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ DX, R10 + + // | j1 + + // | w5 @ R8 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w6 @ R15 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w7 @ R14 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w8 @ R13 + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + 
MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w9 @ DI + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w10 @ SI + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w11 @ BX + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R10, BX + + // | move to idle register + MOVQ 208(SP), R9 + + // | w12 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R8 | 6 R15 | 7 R14 | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 | 13 216(SP) + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u5 = w5 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u5 + MOVQ R11, 112(SP) + + // | j0 + + // | w5 @ R8 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ DX, R10 + + // | j1 + + // | w6 @ R15 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w7 @ R14 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w8 @ R13 + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w9 @ DI + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w10 @ SI + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w11 @ BX + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w12 @ R9 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R10, R9 + + // | move to idle register + MOVQ 216(SP), R8 + + // | w13 @ R8 + ADCQ CX, R8 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R15 | 7 R14 | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 | 13 R8 + // | 14 224(SP) | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u6 = w6 * inp + MOVQ R15, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u6 + MOVQ R11, 120(SP) + + // | j0 + + // | w6 @ R15 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ DX, R10 + + // | j1 + + // | w7 @ R14 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w8 @ R13 + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w9 @ DI + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w10 @ SI + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w11 @ BX + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w12 @ R9 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + 
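+	// | note: j7 multiplies by the eighth modulus limb, the last one handled
+	// | in this q1 pass; limbs 8..13 of the modulus are folded in by q2 below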
// | w13 @ R8 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ DX, CX + ADDQ R10, R8 + + // | move to idle register + MOVQ 224(SP), R15 + + // | w14 @ R15 + ADCQ CX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R14 | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 | 13 R8 + // | 14 R15 | 15 232(SP) | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u7 = w7 * inp + MOVQ R14, AX + MULQ inp+32(FP) + MOVQ AX, R11 + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u7 + MOVQ R11, 128(SP) + + // | j0 + + // | w7 @ R14 + MOVQ (R12), AX + MULQ R11 + ADDQ AX, R14 + ADCQ DX, R10 + + // | j1 + + // | w8 @ R13 + MOVQ 8(R12), AX + MULQ R11 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R10, R13 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w9 @ DI + MOVQ 16(R12), AX + MULQ R11 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w10 @ SI + MOVQ 24(R12), AX + MULQ R11 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w11 @ BX + MOVQ 32(R12), AX + MULQ R11 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w12 @ R9 + MOVQ 40(R12), AX + MULQ R11 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w13 @ R8 + MOVQ 48(R12), AX + MULQ R11 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w14 @ R15 + MOVQ 56(R12), AX + MULQ R11 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R10, R15 + + // | move to idle register + MOVQ 232(SP), R14 + + // | w15 @ R14 + ADCQ CX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 | 13 R8 + // | 14 R15 | 15 R14 | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 136(SP) + + // | + +/* montgomerry reduction q2 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 | 13 R8 + // | 14 R15 | 15 R14 | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w8 @ R13 + MOVQ 64(R12), AX + MULQ 72(SP) + ADDQ AX, R13 + ADCQ DX, R10 + + // | j9 + + // | w9 @ DI + MOVQ 72(R12), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w10 @ SI + MOVQ 80(R12), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w11 @ BX + MOVQ 88(R12), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w12 @ R9 + MOVQ 96(R12), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w13 @ R8 + MOVQ 104(R12), AX + MULQ 72(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + + // | w14 @ R15 + ADCQ DX, R15 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 DI | 10 SI | 11 BX | 12 R9 
| 13 R8 + // | 14 R15 | 15 R14 | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w9 @ DI + MOVQ 64(R12), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R10 + MOVQ DI, 72(SP) + + // | j9 + + // | w10 @ SI + MOVQ 72(R12), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w11 @ BX + MOVQ 80(R12), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w12 @ R9 + MOVQ 88(R12), AX + MULQ 80(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w13 @ R8 + MOVQ 96(R12), AX + MULQ 80(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w14 @ R15 + MOVQ 104(R12), AX + MULQ 80(SP) + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R10, R15 + + // | w15 @ R14 + ADCQ CX, R14 + + // | bring the carry from q1 + MOVQ 136(SP), CX + ADCQ $0x00, CX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 SI | 11 BX | 12 R9 | 13 R8 + // | 14 R15 | 15 R14 | 16 240(SP) | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w10 @ SI + MOVQ 64(R12), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R10 + MOVQ SI, 80(SP) + + // | j9 + + // | w11 @ BX + MOVQ 72(R12), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w12 @ R9 + MOVQ 80(R12), AX + MULQ 88(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w13 @ R8 + MOVQ 88(R12), AX + MULQ 88(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w14 @ R15 + MOVQ 96(R12), AX + MULQ 88(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w15 @ R14 + MOVQ 104(R12), AX + MULQ 88(SP) + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R10, R14 + + // | move to an idle register + MOVQ 240(SP), R11 + + // | w16 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 BX | 12 R9 | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 248(SP) | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w11 @ BX + MOVQ 64(R12), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R10 + MOVQ BX, 88(SP) + + // | j9 + + // | w12 @ R9 + MOVQ 72(R12), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w13 @ R8 + MOVQ 80(R12), AX + MULQ 96(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w14 @ R15 + MOVQ 88(R12), AX + MULQ 96(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w15 @ R14 + MOVQ 96(R12), AX + MULQ 96(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w16 @ R11 + MOVQ 104(R12), AX + MULQ 96(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R10, R11 + + // | move to an idle 
register + MOVQ 248(SP), BX + + // | w17 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R9 | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 BX | 18 256(SP) | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w12 @ R9 + MOVQ 64(R12), AX + MULQ 104(SP) + ADDQ AX, R9 + ADCQ DX, R10 + MOVQ R9, 96(SP) + + // | j9 + + // | w13 @ R8 + MOVQ 72(R12), AX + MULQ 104(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w14 @ R15 + MOVQ 80(R12), AX + MULQ 104(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w15 @ R14 + MOVQ 88(R12), AX + MULQ 104(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w16 @ R11 + MOVQ 96(R12), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w17 @ BX + MOVQ 104(R12), AX + MULQ 104(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R10, BX + + // | move to an idle register + MOVQ 256(SP), DI + + // | w18 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 BX | 18 DI | 19 64(SP) | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w13 @ R8 + MOVQ 64(R12), AX + MULQ 112(SP) + ADDQ AX, R8 + ADCQ DX, R10 + + // | j9 + + // | w14 @ R15 + MOVQ 72(R12), AX + MULQ 112(SP) + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w15 @ R14 + MOVQ 80(R12), AX + MULQ 112(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w16 @ R11 + MOVQ 88(R12), AX + MULQ 112(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w17 @ BX + MOVQ 96(R12), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w18 @ DI + MOVQ 104(R12), AX + MULQ 112(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R10, DI + + // | move to an idle register + MOVQ 64(SP), SI + + // | w19 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 BX | 18 DI | 19 SI | 20 56(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w14 @ R15 + MOVQ 64(R12), AX + MULQ 120(SP) + ADDQ AX, R15 + ADCQ DX, R10 + + // | j9 + + // | w15 @ R14 + MOVQ 72(R12), AX + MULQ 120(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w16 @ R11 + MOVQ 80(R12), AX + MULQ 120(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w17 @ BX + MOVQ 88(R12), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w18 @ DI + MOVQ 96(R12), AX + 
MULQ 120(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w19 @ SI + MOVQ 104(R12), AX + MULQ 120(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R10, SI + + // | move to an idle register + MOVQ 56(SP), R9 + + // | w20 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 BX | 18 DI | 19 SI | 20 R9 | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w15 @ R14 + MOVQ 64(R12), AX + MULQ 128(SP) + ADDQ AX, R14 + ADCQ DX, R10 + + // | j9 + + // | w16 @ R11 + MOVQ 72(R12), AX + MULQ 128(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w17 @ BX + MOVQ 80(R12), AX + MULQ 128(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w18 @ DI + MOVQ 88(R12), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w19 @ SI + MOVQ 96(R12), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w20 @ R9 + MOVQ 104(R12), AX + MULQ 128(SP) + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R10, R9 + + // | tolarete this limb to stay in stack + // | w21 @ 48(SP) + ADCQ CX, 48(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX + + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 BX | 18 DI | 19 SI | 20 R9 | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | save the carry from q2 + // | should be added to w22 + MOVQ CX, 136(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 72(SP), CX + MOVQ R9, 72(SP) + MOVQ 80(SP), R9 + MOVQ SI, 80(SP) + MOVQ 88(SP), SI + MOVQ DI, 88(SP) + MOVQ 96(SP), DI + MOVQ BX, 96(SP) + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 CX | 10 R9 | 11 SI | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 96(SP) | 18 88(SP) | 19 80(SP) | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 CX | 10 R9 | 11 SI | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 96(SP) | 18 88(SP) | 19 80(SP) | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u8 = w8 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u8 + MOVQ BX, 104(SP) + + // | j0 + + // | w8 @ R13 + MOVQ (R12), AX + MULQ BX + ADDQ AX, R13 + ADCQ DX, R10 + + // | j1 + + // | w9 @ CX + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w10 @ R9 + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w11 @ SI + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w12 @ DI + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // 
| w13 @ R8 + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w14 @ R15 + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w15 @ R14 + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + + // | w16 @ R11 + ADCQ DX, R11 + ADCQ $0x00, R13 + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 CX | 10 R9 | 11 SI | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 96(SP) | 18 88(SP) | 19 80(SP) | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u9 = w9 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u9 + MOVQ BX, 112(SP) + + // | j0 + + // | w9 @ CX + MOVQ (R12), AX + MULQ BX + ADDQ AX, CX + ADCQ DX, R10 + + // | j1 + + // | w10 @ R9 + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w11 @ SI + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w12 @ DI + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w13 @ R8 + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w14 @ R15 + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w15 @ R14 + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w16 @ R11 + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, R11 + ADCQ DX, R13 + ADDQ R10, R11 + + // | move to idle register + MOVQ 96(SP), CX + + // | w17 @ CX + ADCQ R13, CX + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R9 | 11 SI | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 88(SP) | 19 80(SP) | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u10 = w10 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u10 + MOVQ BX, 96(SP) + + // | j0 + + // | w10 @ R9 + MOVQ (R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ DX, R10 + + // | j1 + + // | w11 @ SI + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w12 @ DI + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w13 @ R8 + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w14 @ R15 + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w15 @ R14 + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w16 @ R11 + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w17 @ CX + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, CX + ADCQ DX, R13 + ADDQ R10, CX + + // | move to idle register + MOVQ 88(SP), R9 + + // | w18 @ R9 + ADCQ 
R13, R9 + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 SI | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 80(SP) | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u11 = w11 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u11 + MOVQ BX, 88(SP) + + // | j0 + + // | w11 @ SI + MOVQ (R12), AX + MULQ BX + ADDQ AX, SI + ADCQ DX, R10 + + // | j1 + + // | w12 @ DI + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w13 @ R8 + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w14 @ R15 + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w15 @ R14 + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w16 @ R11 + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w17 @ CX + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w18 @ R9 + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ DX, R13 + ADDQ R10, R9 + + // | move to idle register + MOVQ 80(SP), SI + + // | w19 @ SI + ADCQ R13, SI + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 12 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 DI | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 SI | 20 72(SP) | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u12 = w12 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u12 + MOVQ BX, 80(SP) + + // | j0 + + // | w12 @ DI + MOVQ (R12), AX + MULQ BX + ADDQ AX, DI + ADCQ DX, R10 + + // | j1 + + // | w13 @ R8 + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w14 @ R15 + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w15 @ R14 + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w16 @ R11 + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w17 @ CX + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w18 @ R9 + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w19 @ SI + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, SI + ADCQ DX, R13 + ADDQ R10, SI + + // | move to idle register + MOVQ 72(SP), DI + + // | w20 @ DI + ADCQ R13, DI + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 13 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 R8 + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 SI | 20 DI | 21 48(SP) | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | | u13 = w13 * inp + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ 
AX, BX + MOVQ $0x00, R10 + + // | + +/* */ + + // | save u13 + MOVQ BX, 72(SP) + + // | j0 + + // | w13 @ R8 + MOVQ (R12), AX + MULQ BX + ADDQ AX, R8 + ADCQ DX, R10 + + // | j1 + + // | w14 @ R15 + MOVQ 8(R12), AX + MULQ BX + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R10, R15 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j2 + + // | w15 @ R14 + MOVQ 16(R12), AX + MULQ BX + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R10, R14 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j3 + + // | w16 @ R11 + MOVQ 24(R12), AX + MULQ BX + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R10, R11 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j4 + + // | w17 @ CX + MOVQ 32(R12), AX + MULQ BX + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j5 + + // | w18 @ R9 + MOVQ 40(R12), AX + MULQ BX + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j6 + + // | w19 @ SI + MOVQ 48(R12), AX + MULQ BX + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j7 + + // | w20 @ DI + MOVQ 56(R12), AX + MULQ BX + ADDQ AX, DI + ADCQ DX, R13 + ADDQ R10, DI + + // | move to idle register + MOVQ 48(SP), R8 + + // | w21 @ R8 + ADCQ R13, R8 + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 SI | 20 DI | 21 R8 | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w22 + ADCQ R13, 136(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 SI | 20 DI | 21 R8 | 22 40(SP) | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w16 @ R11 + MOVQ 64(R12), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ DX, R10 + + // | j9 + + // | w17 @ CX + MOVQ 72(R12), AX + MULQ 104(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w18 @ R9 + MOVQ 80(R12), AX + MULQ 104(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w19 @ SI + MOVQ 88(R12), AX + MULQ 104(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w20 @ DI + MOVQ 96(R12), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w21 @ R8 + MOVQ 104(R12), AX + MULQ 104(SP) + ADDQ AX, R8 + ADCQ 136(SP), DX + ADDQ R10, R8 + MOVQ 40(SP), BX + + // | w22 @ BX + ADCQ DX, BX + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 CX | 18 R9 | 19 SI | 20 DI | 21 R8 | 22 BX | 23 32(SP) | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w17 @ CX + MOVQ 64(R12), AX + MULQ 112(SP) + ADDQ AX, CX + ADCQ DX, R10 + MOVQ CX, 48(SP) + + // | j9 + + // | w18 @ R9 + MOVQ 72(R12), AX + MULQ 112(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w19 @ SI + MOVQ 80(R12), AX + MULQ 112(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w20 @ DI + MOVQ 88(R12), 
AX + MULQ 112(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w21 @ R8 + MOVQ 96(R12), AX + MULQ 112(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w22 @ BX + MOVQ 104(R12), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ DX, R13 + ADDQ R10, BX + MOVQ 32(SP), CX + + // | w23 @ CX + ADCQ R13, CX + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 48(SP) | 18 R9 | 19 SI | 20 DI | 21 R8 | 22 BX | 23 CX | 24 24(SP) | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w18 @ R9 + MOVQ 64(R12), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ DX, R10 + MOVQ R9, 32(SP) + + // | j9 + + // | w19 @ SI + MOVQ 72(R12), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w20 @ DI + MOVQ 80(R12), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w21 @ R8 + MOVQ 88(R12), AX + MULQ 96(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w22 @ BX + MOVQ 96(R12), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w23 @ CX + MOVQ 104(R12), AX + MULQ 96(SP) + ADDQ AX, CX + ADCQ DX, R13 + ADDQ R10, CX + MOVQ 24(SP), R9 + + // | w24 @ R9 + ADCQ R13, R9 + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 48(SP) | 18 32(SP) | 19 SI | 20 DI | 21 R8 | 22 BX | 23 CX | 24 R9 | 25 16(SP) | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w19 @ SI + MOVQ 64(R12), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R10 + MOVQ SI, 24(SP) + + // | j9 + + // | w20 @ DI + MOVQ 72(R12), AX + MULQ 88(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R10, DI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w21 @ R8 + MOVQ 80(R12), AX + MULQ 88(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w22 @ BX + MOVQ 88(R12), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w23 @ CX + MOVQ 96(R12), AX + MULQ 88(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w24 @ R9 + MOVQ 104(R12), AX + MULQ 88(SP) + ADDQ AX, R9 + ADCQ DX, R13 + ADDQ R10, R9 + MOVQ 16(SP), SI + + // | w25 @ SI + ADCQ R13, SI + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 48(SP) | 18 32(SP) | 19 24(SP) | 20 DI | 21 R8 | 22 BX | 23 CX | 24 R9 | 25 SI | 26 8(SP) | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w20 @ DI + MOVQ 64(R12), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R10 + MOVQ DI, 16(SP) + + // | j9 + + // | w21 @ R8 + MOVQ 72(R12), AX + MULQ 80(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R10, R8 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w22 @ BX + MOVQ 80(R12), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w23 @ CX 
+ MOVQ 88(R12), AX + MULQ 80(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w24 @ R9 + MOVQ 96(R12), AX + MULQ 80(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w25 @ SI + MOVQ 104(R12), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, R13 + ADDQ R10, SI + MOVQ 8(SP), DI + + // | w26 @ DI + ADCQ R13, DI + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 48(SP) | 18 32(SP) | 19 24(SP) | 20 16(SP) | 21 R8 | 22 BX | 23 CX | 24 R9 | 25 SI | 26 DI | 27 (SP) + + + MOVQ $0x00, R10 + + // | + +/* */ + + // | j8 + + // | w21 @ R8 + MOVQ 64(R12), AX + MULQ 72(SP) + ADDQ AX, R8 + ADCQ DX, R10 + + // | j9 + + // | w22 @ BX + MOVQ 72(R12), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R10, BX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j10 + + // | w23 @ CX + MOVQ 80(R12), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R10, CX + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j11 + + // | w24 @ R9 + MOVQ 88(R12), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R10, R9 + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j12 + + // | w25 @ SI + MOVQ 96(R12), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R10, SI + MOVQ $0x00, R10 + ADCQ DX, R10 + + // | j13 + + // | w26 @ DI + MOVQ 104(R12), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ DX, R13 + ADDQ R10, DI + + // | very last limb goes to short carry register + MOVQ (SP), R10 + + // | w-1 @ R10 + ADCQ R13, R10 + MOVQ $0x00, R13 + ADCQ $0x00, R13 + + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - + // | 14 R15 | 15 R14 | 16 R11 | 17 48(SP) | 18 32(SP) | 19 24(SP) | 20 16(SP) | 21 R8 | 22 BX | 23 CX | 24 R9 | 25 SI | 26 DI | 27 R10 + + + // | + +/* modular reduction */ + + MOVQ R15, DX + SUBQ (R12), DX + MOVQ DX, (SP) + MOVQ R14, DX + SBBQ 8(R12), DX + MOVQ DX, 8(SP) + MOVQ R11, DX + SBBQ 16(R12), DX + MOVQ DX, 120(SP) + MOVQ 48(SP), DX + SBBQ 24(R12), DX + MOVQ DX, 128(SP) + MOVQ 32(SP), DX + SBBQ 32(R12), DX + MOVQ DX, 144(SP) + MOVQ 24(SP), DX + SBBQ 40(R12), DX + MOVQ DX, 152(SP) + MOVQ 16(SP), DX + SBBQ 48(R12), DX + MOVQ DX, 160(SP) + MOVQ R8, DX + SBBQ 56(R12), DX + MOVQ DX, 168(SP) + MOVQ BX, DX + SBBQ 64(R12), DX + MOVQ DX, 176(SP) + MOVQ CX, DX + SBBQ 72(R12), DX + MOVQ DX, 184(SP) + MOVQ R9, DX + SBBQ 80(R12), DX + MOVQ DX, 192(SP) + MOVQ SI, DX + SBBQ 88(R12), DX + MOVQ DX, 200(SP) + MOVQ DI, DX + SBBQ 96(R12), DX + MOVQ DX, 208(SP) + MOVQ R10, DX + SBBQ 104(R12), DX + MOVQ DX, 216(SP) + SBBQ $0x00, R13 + + // | + +/* out */ + + MOVQ c+0(FP), R13 + CMOVQCC (SP), R15 + MOVQ R15, (R13) + CMOVQCC 8(SP), R14 + MOVQ R14, 8(R13) + CMOVQCC 120(SP), R11 + MOVQ R11, 16(R13) + MOVQ 48(SP), DX + CMOVQCC 128(SP), DX + MOVQ DX, 24(R13) + MOVQ 32(SP), DX + CMOVQCC 144(SP), DX + MOVQ DX, 32(R13) + MOVQ 24(SP), DX + CMOVQCC 152(SP), DX + MOVQ DX, 40(R13) + MOVQ 16(SP), DX + CMOVQCC 160(SP), DX + MOVQ DX, 48(R13) + CMOVQCC 168(SP), R8 + MOVQ R8, 56(R13) + CMOVQCC 176(SP), BX + MOVQ BX, 64(R13) + CMOVQCC 184(SP), CX + MOVQ CX, 72(R13) + CMOVQCC 192(SP), R9 + MOVQ R9, 80(R13) + CMOVQCC 200(SP), SI + MOVQ SI, 88(R13) + CMOVQCC 208(SP), DI + MOVQ DI, 96(R13) + CMOVQCC 216(SP), R10 + MOVQ R10, 104(R13) + RET + + // | + +/* end */ + + +// func cpy15(dst *[15]uint64, src *[15]uint64) +TEXT ·cpy15(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + 
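// | copy 15 limbs (120 bytes) from src to dst, one 64-bit word at a time + 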
MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + MOVQ 88(SI), R8 + MOVQ R8, 88(DI) + MOVQ 96(SI), R8 + MOVQ R8, 96(DI) + MOVQ 104(SI), R8 + MOVQ R8, 104(DI) + MOVQ 112(SI), R8 + MOVQ R8, 112(DI) + RET + +// func eq15(a *[15]uint64, b *[15]uint64) bool +TEXT ·eq15(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JNE ret + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JNE ret + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JNE ret + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JNE ret + MOVQ 112(DI), R8 + CMPQ 112(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp15(a *[15]uint64, b *[15]uint64) int8 +TEXT ·cmp15(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 112(DI), R8 + CMPQ 112(SI), R8 + JB gt + JA lt + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JB gt + JA lt + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JB gt + JA lt + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JB gt + JA lt + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret + +gt: + MOVB $0x01, ret+16(FP) + JMP ret + +lt: + MOVB $0xff, ret+16(FP) + +ret: + RET + +// func add15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64) +TEXT ·add15(SB), NOSPLIT, $160-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ 112(SI), BX + MOVQ BX, 32(SP) + ADCQ $0x00, AX + + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 40(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 48(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 56(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 64(SP) + MOVQ R10, BX + SBBQ 32(SI), BX 
+ MOVQ BX, 72(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 80(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 88(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 96(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 104(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 112(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 120(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 128(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 136(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 144(SP) + MOVQ 32(SP), BX + SBBQ 112(SI), BX + MOVQ BX, 152(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 40(SP), CX + MOVQ CX, (DI) + CMOVQCC 48(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 56(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 64(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 72(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 80(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 88(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 96(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 104(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 112(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 120(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 128(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 136(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + CMOVQCC 144(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + CMOVQCC 152(SP), BX + MOVQ BX, 112(DI) + RET + + // | + +/* end */ + + RET + +// func addn15(a *[15]uint64, b *[15]uint64) uint64 +TEXT ·addn15(SB), NOSPLIT, $40-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ 112(SI), BX + MOVQ BX, 32(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func double15(c *[15]uint64, a *[15]uint64, p *[15]uint64) +TEXT ·double15(SB), NOSPLIT, $160-24 + // | + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ BX, BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ BX, BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ BX, BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ BX, BX + MOVQ BX, 32(SP) + ADCQ $0x00, AX + + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 40(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 48(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 56(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 64(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + 
MOVQ BX, 72(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 80(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 88(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 96(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 104(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 112(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 120(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 128(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 136(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 144(SP) + MOVQ 32(SP), BX + SBBQ 112(SI), BX + MOVQ BX, 152(SP) + SBBQ $0x00, AX + + // | + MOVQ c+0(FP), DI + CMOVQCC 40(SP), CX MOVQ CX, (DI) - CMOVQCC 56(SP), DX + CMOVQCC 48(SP), DX MOVQ DX, 8(DI) - CMOVQCC 64(SP), R8 + CMOVQCC 56(SP), R8 MOVQ R8, 16(DI) - CMOVQCC 72(SP), R9 + CMOVQCC 64(SP), R9 MOVQ R9, 24(DI) - CMOVQCC 80(SP), R10 + CMOVQCC 72(SP), R10 MOVQ R10, 32(DI) - CMOVQCC 88(SP), R11 + CMOVQCC 80(SP), R11 MOVQ R11, 40(DI) - CMOVQCC 96(SP), R12 + CMOVQCC 88(SP), R12 MOVQ R12, 48(DI) - CMOVQCC 104(SP), R13 + CMOVQCC 96(SP), R13 MOVQ R13, 56(DI) - CMOVQCC 112(SP), R14 + CMOVQCC 104(SP), R14 MOVQ R14, 64(DI) - CMOVQCC 120(SP), R15 + CMOVQCC 112(SP), R15 MOVQ R15, 72(DI) MOVQ (SP), BX - CMOVQCC 128(SP), BX + CMOVQCC 120(SP), BX MOVQ BX, 80(DI) MOVQ 8(SP), BX - CMOVQCC 136(SP), BX + CMOVQCC 128(SP), BX MOVQ BX, 88(DI) MOVQ 16(SP), BX - CMOVQCC 144(SP), BX + CMOVQCC 136(SP), BX MOVQ BX, 96(DI) MOVQ 24(SP), BX - CMOVQCC 152(SP), BX + CMOVQCC 144(SP), BX MOVQ BX, 104(DI) MOVQ 32(SP), BX - CMOVQCC 160(SP), BX + CMOVQCC 152(SP), BX MOVQ BX, 112(DI) - MOVQ 40(SP), BX - CMOVQCC 168(SP), BX - MOVQ BX, 120(DI) RET -// func sub16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64) -TEXT ·sub16(SB), NOSPLIT, $176-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - SBBQ 112(SI), BX - MOVQ BX, 32(SP) - MOVQ 120(DI), BX - SBBQ 120(SI), BX - MOVQ BX, 40(SP) + // | + +/* end */ + + RET + +// func sub15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64) +TEXT ·sub15(SB), NOSPLIT, $160-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + SBBQ 112(SI), BX + MOVQ BX, 32(SP) + + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 40(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 48(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 40(SI), AX 
+ MOVQ AX, 80(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 88(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 96(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 104(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 112(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 120(SP) + CMOVQCS 88(SI), AX + MOVQ AX, 128(SP) + CMOVQCS 96(SI), AX + MOVQ AX, 136(SP) + CMOVQCS 104(SI), AX + MOVQ AX, 144(SP) + CMOVQCS 112(SI), AX + MOVQ AX, 152(SP) + + // | + MOVQ c+0(FP), DI + ADDQ 40(SP), CX + MOVQ CX, (DI) + ADCQ 48(SP), DX + MOVQ DX, 8(DI) + ADCQ 56(SP), R8 + MOVQ R8, 16(DI) + ADCQ 64(SP), R9 + MOVQ R9, 24(DI) + ADCQ 72(SP), R10 + MOVQ R10, 32(DI) + ADCQ 80(SP), R11 + MOVQ R11, 40(DI) + ADCQ 88(SP), R12 + MOVQ R12, 48(DI) + ADCQ 96(SP), R13 + MOVQ R13, 56(DI) + ADCQ 104(SP), R14 + MOVQ R14, 64(DI) + ADCQ 112(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 120(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + ADCQ 128(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + ADCQ 136(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + ADCQ 144(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + ADCQ 152(SP), BX + MOVQ BX, 112(DI) + RET + + // | + +/* end */ + + RET + +// func subn15(a *[15]uint64, b *[15]uint64) uint64 +TEXT ·subn15(SB), NOSPLIT, $40-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX + + // | + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + SBBQ 112(SI), BX + MOVQ BX, 32(SP) + ADCQ $0x00, AX + + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + MOVQ AX, ret+16(FP) + RET + + // | + +/* end */ + + RET + +// func _neg15(c *[15]uint64, a *[15]uint64, p *[15]uint64) +TEXT ·_neg15(SB), NOSPLIT, $40-24 + // | + MOVQ a+8(FP), DI + + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + MOVQ 88(SI), BX + SBBQ 88(DI), BX + MOVQ BX, 8(SP) + MOVQ 96(SI), BX + SBBQ 96(DI), BX + MOVQ BX, 16(SP) + MOVQ 104(SI), BX + SBBQ 104(DI), BX + MOVQ BX, 24(SP) + MOVQ 112(SI), BX + SBBQ 112(DI), BX + MOVQ BX, 32(SP) + + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + RET + + // | + +/* end */ + + RET + +// func 
mul_two_15(a *[15]uint64) +TEXT ·mul_two_15(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RCLQ $0x01, 88(DI) + RCLQ $0x01, 96(DI) + RCLQ $0x01, 104(DI) + RCLQ $0x01, 112(DI) + RET + +// func div_two_15(a *[15]uint64) +TEXT ·div_two_15(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 112(DI) + RCRQ $0x01, 104(DI) + RCRQ $0x01, 96(DI) + RCRQ $0x01, 88(DI) + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET + +// func mul15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64, inp uint64) +TEXT ·mul15(SB), NOSPLIT, $280-40 + // | + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) + + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 + + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 + + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 + + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX + + // | a1 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX + + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX + + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 + + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX + + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 + + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b4 
+ MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX + + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 + + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX + + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 + + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX + + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 + + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX + + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 + + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a7 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX + + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ 
BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 + + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX + + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 + + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX + + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX + + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ AX, AX + + // | a11 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 88(SP) + MOVQ $0x00, R8 + + // | a11 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a11 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a11 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a11 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a11 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ AX, AX + + // | a12 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 96(SP) + MOVQ $0x00, R9 + + // | a12 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a12 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a12 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a12 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ 
BX, CX + + // | a12 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a12 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 13 */ + + // | a13 @ DX + MOVQ 104(DI), DX + XORQ AX, AX + + // | a13 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 104(SP) + MOVQ $0x00, R10 + + // | a13 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a13 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a13 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a13 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a13 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a13 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a13 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a13 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 14 */ + + // | a14 @ DX + MOVQ 112(DI), DX + XORQ AX, AX + + // | a14 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 112(SP) + MOVQ $0x00, R11 + + // | a14 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a14 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a14 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a14 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a14 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a14 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a14 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a14 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ BX, R11 + ADCQ $0x00, R11 + + // | + +/* */ + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 R12 | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - + + + MOVQ R12, 120(SP) + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + MOVQ R8, 160(SP) + MOVQ R9, 168(SP) + MOVQ R10, 176(SP) + MOVQ R11, 184(SP) + + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 184(SP) | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - + + + XORQ AX, AX + + // | + +/* i = 0 */ + + // | a0 @ DX + MOVQ (DI), DX + + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 192(SP) + + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX + + // | a0 * b11 + MULXQ 88(SI), AX, R9 + ADCXQ AX, R8 + + // | a0 * b12 + MULXQ 96(SI), AX, R10 + ADCXQ AX, R9 + + // | a0 * b13 + MULXQ 104(SI), AX, R11 + ADCXQ AX, R10 + + // | a0 * b14 + MULXQ 112(SI), AX, R12 + ADCXQ AX, R11 + ADCQ $0x00, R12 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX + XORQ R13, R13 + + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 200(SP) + + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a1 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a1 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a1 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a1 * b14 + MULXQ 112(SI), AX, BX + 
ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 2 */ + + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R14, R14 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 208(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a2 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a2 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a2 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a2 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 3 */ + + // | a3 @ DX + MOVQ 24(DI), DX + XORQ R15, R15 + + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 216(SP) + + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a3 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a3 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a3 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a3 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 4 */ + + // | a4 @ DX + MOVQ 32(DI), DX + XORQ CX, CX + + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 224(SP) + + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a4 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a4 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a4 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a4 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 5 */ + + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R8, R8 + + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 232(SP) + + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a5 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a5 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a5 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a5 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 + + // | + +/* i = 6 */ + + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R9, R9 + + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 240(SP) + + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a6 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a6 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a6 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a6 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 + + // | + +/* i = 7 */ + + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R10, R10 + + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 248(SP) + + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a7 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a7 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a7 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a7 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 + + // | + +/* i = 8 */ + + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R11, R11 + + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 256(SP) + + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADCXQ 
BX, CX + + // | a8 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a8 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a8 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a8 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 + + // | + +/* i = 9 */ + + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R12, R12 + + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 264(SP) + + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + + // | a9 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a9 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a9 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a9 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 + + // | + +/* i = 10 */ + + // | a10 @ DX + MOVQ 80(DI), DX + XORQ R13, R13 + + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 272(SP) + + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a10 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a10 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a10 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a10 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 + + // | + +/* i = 11 */ + + // | a11 @ DX + MOVQ 88(DI), DX + XORQ R14, R14 + + // | a11 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + + // | a11 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a11 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a11 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a11 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 + + // | + +/* i = 12 */ + + // | a12 @ DX + MOVQ 96(DI), DX + XORQ R15, R15 + + // | a12 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a12 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a12 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a12 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a12 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a12 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 + + // | + +/* i = 13 */ + + // | a13 @ DX + MOVQ 104(DI), DX + XORQ CX, CX + + // | a13 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + + // | a13 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a13 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a13 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a13 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a13 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX + + // | + +/* i = 14 */ + + // | a14 @ DX + MOVQ 112(DI), DX + XORQ DI, DI + + // | a14 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + + // | a14 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + + // | a14 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + + // | a14 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + + // | a14 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + + // | a14 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, CX + ADOXQ BX, DI + ADCQ $0x00, DI 
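+
+ // |
+ // | The full 30-limb product w0..w29 is accumulated in two passes, apparently
+ // | to stay within the available general purpose registers: an earlier pass
+ // | covered the partial products against b0..b8 ("W right", spilled to the
+ // | stack), and the pass above covered b9..b14 ("W left"). The two halves
+ // | overlap in limbs 9..23 and are summed with a single carry chain below,
+ // | before the interleaved Montgomery reduction (q1..q4) begins.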
+ + // | + +/* */ + + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 R15 | 28 CX | 29 DI + + + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 184(SP) | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - + + + MOVQ 72(SP), AX + ADDQ AX, 192(SP) + MOVQ 80(SP), AX + ADCQ AX, 200(SP) + MOVQ 88(SP), AX + ADCQ AX, 208(SP) + MOVQ 96(SP), AX + ADCQ AX, 216(SP) + MOVQ 104(SP), AX + ADCQ AX, 224(SP) + MOVQ 112(SP), AX + ADCQ AX, 232(SP) + MOVQ 120(SP), AX + ADCQ AX, 240(SP) + MOVQ 128(SP), AX + ADCQ AX, 248(SP) + MOVQ 136(SP), AX + ADCQ AX, 256(SP) + MOVQ 144(SP), AX + ADCQ AX, 264(SP) + MOVQ 152(SP), AX + ADCQ AX, 272(SP) + ADCQ 160(SP), R8 + ADCQ 168(SP), R9 + ADCQ 176(SP), R10 + ADCQ 184(SP), R11 + ADCQ $0x00, R12 + ADCQ $0x00, R13 + ADCQ $0x00, R14 + ADCQ $0x00, R15 + ADCQ $0x00, CX + ADCQ $0x00, DI + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 R15 | 28 CX | 29 DI + + + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ CX, 8(SP) + MOVQ 24(SP), CX + MOVQ R15, 16(SP) + MOVQ 32(SP), R15 + MOVQ R14, 24(SP) + MOVQ 40(SP), R14 + MOVQ R13, 32(SP) + MOVQ 48(SP), R13 + MOVQ R12, 40(SP) + MOVQ 56(SP), R12 + MOVQ R11, 48(SP) + MOVQ 64(SP), R11 + MOVQ R10, 56(SP) + MOVQ 192(SP), R10 + MOVQ R9, 64(SP) + MOVQ R8, 72(SP) + + // | fetch modulus + MOVQ p+24(FP), R8 + + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 CX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | + +/* montgomery reduction q1 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 CX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R9 + + // | save u0 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w0 @ BX + MULXQ (R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j1 + + // | w1 @ SI + MULXQ 8(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j2 + + // | w2 @ DI + MULXQ 16(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j3 + + // | w3 @ CX + MULXQ 24(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j4 + + // | w4 @ R15 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j5 + + // | w5 @ R14 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j6 + + // | w6 @ R13 + MULXQ 
48(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j7 + + // | w7 @ R12 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j8 + + // | w8 @ R11 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + ADOXQ BX, R10 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 CX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R9 + + // | save u1 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w1 @ SI + MULXQ (R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j1 + + // | w2 @ DI + MULXQ 8(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j2 + + // | w3 @ CX + MULXQ 16(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j3 + + // | w4 @ R15 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j4 + + // | w5 @ R14 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j5 + + // | w6 @ R13 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j6 + + // | w7 @ R12 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j7 + + // | w8 @ R11 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j8 + + // | w9 @ R10 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R10 + + // | w10 @ 200(SP) + // | move to temp register + MOVQ 200(SP), AX + ADCXQ R9, AX + ADOXQ BX, AX + + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 DI | 3 CX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R9 + + // | save u2 + MOVQ DX, 96(SP) + + // | + +/* */ + + // | j0 + + // | w2 @ DI + MULXQ (R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j1 + + // | w3 @ CX + MULXQ 8(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j2 + + // | w4 @ R15 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j3 + + // | w5 @ R14 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j4 + + // | w6 @ R13 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j5 + + // | w7 @ R12 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j6 + + // | w8 @ R11 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j7 + + // | w9 @ R10 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j8 + + // | w10 @ BX + MULXQ 64(R8), AX, R9 + ADOXQ AX, BX + + // | w11 @ 208(SP) + // | move to temp register + MOVQ 208(SP), AX + ADCXQ R9, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 CX | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 
23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u3 = w3 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R9 + + // | save u3 + MOVQ DX, 104(SP) + + // | + +/* */ + + // | j0 + + // | w3 @ CX + MULXQ (R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j1 + + // | w4 @ R15 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j2 + + // | w5 @ R14 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j3 + + // | w6 @ R13 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j4 + + // | w7 @ R12 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j5 + + // | w8 @ R11 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j6 + + // | w9 @ R10 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j7 + + // | w10 @ BX + MULXQ 56(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j8 + + // | w11 @ SI + MULXQ 64(R8), AX, R9 + ADOXQ AX, SI + + // | w12 @ 216(SP) + // | move to temp register + MOVQ 216(SP), AX + ADCXQ R9, AX + ADOXQ DI, AX + + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R15 | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 DI | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u4 = w4 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R9 + + // | save u4 + MOVQ DX, 112(SP) + + // | + +/* */ + + // | j0 + + // | w4 @ R15 + MULXQ (R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j1 + + // | w5 @ R14 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j2 + + // | w6 @ R13 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j3 + + // | w7 @ R12 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j4 + + // | w8 @ R11 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j5 + + // | w9 @ R10 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j6 + + // | w10 @ BX + MULXQ 48(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j7 + + // | w11 @ SI + MULXQ 56(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j8 + + // | w12 @ DI + MULXQ 64(R8), AX, R9 + ADOXQ AX, DI + + // | w13 @ 224(SP) + // | move to temp register + MOVQ 224(SP), AX + ADCXQ R9, AX + ADOXQ CX, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, CX + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R14 | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u5 = w5 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R9 + + // | save u5 + MOVQ DX, 120(SP) + + // | + +/* */ + + // | j0 + + // | w5 @ R14 + MULXQ (R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j1 + + // | w6 @ R13 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j2 + + // | w7 @ R12 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j3 + + // | w8 @ R11 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j4 + + // | w9 @ R10 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + 
// | j5 + + // | w10 @ BX + MULXQ 40(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j6 + + // | w11 @ SI + MULXQ 48(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j7 + + // | w12 @ DI + MULXQ 56(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j8 + + // | w13 @ CX + MULXQ 64(R8), AX, R9 + ADOXQ AX, CX + + // | w14 @ 232(SP) + // | move to temp register + MOVQ 232(SP), AX + ADCXQ R9, AX + ADOXQ R15, AX + + // | move to an idle register + // | w14 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R13 | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u6 = w6 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R9 + + // | save u6 + MOVQ DX, 128(SP) + + // | + +/* */ + + // | j0 + + // | w6 @ R13 + MULXQ (R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j1 + + // | w7 @ R12 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j2 + + // | w8 @ R11 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j3 + + // | w9 @ R10 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j4 + + // | w10 @ BX + MULXQ 32(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j5 + + // | w11 @ SI + MULXQ 40(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j6 + + // | w12 @ DI + MULXQ 48(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j7 + + // | w13 @ CX + MULXQ 56(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j8 + + // | w14 @ R15 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R15 + + // | w15 @ 240(SP) + // | move to temp register + MOVQ 240(SP), AX + ADCXQ R9, AX + ADOXQ R14, AX + + // | move to an idle register + // | w15 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R12 | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u7 = w7 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R9 + + // | save u7 + MOVQ DX, 136(SP) + + // | + +/* */ + + // | j0 + + // | w7 @ R12 + MULXQ (R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, R11 + + // | j1 + + // | w8 @ R11 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j2 + + // | w9 @ R10 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j3 + + // | w10 @ BX + MULXQ 24(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j4 + + // | w11 @ SI + MULXQ 32(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j5 + + // | w12 @ DI + MULXQ 40(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j6 + + // | w13 @ CX + MULXQ 48(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j7 + + // | w14 @ R15 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j8 + + // | w15 @ R14 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R14 + + // | w16 @ 248(SP) + // | move to temp register + MOVQ 248(SP), AX + ADCXQ R9, AX + ADOXQ R13, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R13 + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 
- | 6 - | 7 - | 8 R11 | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u8 = w8 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R9 + + // | save u8 + MOVQ DX, 144(SP) + + // | + +/* */ + + // | j0 + + // | w8 @ R11 + MULXQ (R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R10 + + // | j1 + + // | w9 @ R10 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j2 + + // | w10 @ BX + MULXQ 16(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j3 + + // | w11 @ SI + MULXQ 24(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j4 + + // | w12 @ DI + MULXQ 32(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j5 + + // | w13 @ CX + MULXQ 40(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j6 + + // | w14 @ R15 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j7 + + // | w15 @ R14 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j8 + + // | w16 @ R13 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R13 + + // | w17 @ 256(SP) + // | move to temp register + MOVQ 256(SP), AX + ADCXQ R9, AX + ADOXQ R12, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R12 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | carry from q1 should be added to w18 + MOVQ R11, 152(SP) + + // | + +/* montgomerry reduction q2 */ + + // | clear flags + XORQ R11, R11 + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w9 @ R10 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, BX + + // | j10 + + // | w10 @ BX + MULXQ 80(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, SI + + // | j11 + + // | w11 @ SI + MULXQ 88(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j12 + + // | w12 @ DI + MULXQ 96(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j13 + + // | w13 @ CX + MULXQ 104(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j14 + + // | w14 @ R15 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + ADOXQ R11, R14 + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 BX | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u1 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w10 @ BX + MULXQ 72(R8), AX, R9 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R9, SI + + // | j10 + + // | w11 @ SI + MULXQ 80(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j11 + + // | w12 @ DI + MULXQ 88(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j12 + + // | w13 @ CX + MULXQ 96(R8), AX, R9 + ADOXQ AX, CX + ADCXQ 
R9, R15 + + // | j13 + + // | w14 @ R15 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j14 + + // | w15 @ R14 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + ADOXQ R11, R13 + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 SI | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u2 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w11 @ SI + MULXQ 72(R8), AX, R9 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R9, DI + + // | j10 + + // | w12 @ DI + MULXQ 80(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j11 + + // | w13 @ CX + MULXQ 88(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j12 + + // | w14 @ R15 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j13 + + // | w15 @ R14 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j14 + + // | w16 @ R13 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + ADOXQ R11, R12 + + // | bring the carry from q1 + MOVQ 152(SP), R11 + MOVQ $0x00, AX + ADCXQ AX, R11 + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 DI | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 264(SP) | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u3 @ 104(SP) + MOVQ 104(SP), DX + + // | + +/* */ + + // | j9 + + // | w12 @ DI + MULXQ 72(R8), AX, R9 + ADOXQ AX, DI + MOVQ DI, 96(SP) + ADCXQ R9, CX + + // | j10 + + // | w13 @ CX + MULXQ 80(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R15 + + // | j11 + + // | w14 @ R15 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j12 + + // | w15 @ R14 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j13 + + // | w16 @ R13 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j14 + + // | w17 @ R12 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R12 + + // | w18 @ 264(SP) + // | move to an idle register + MOVQ 264(SP), BX + + // | w18 @ BX + ADCXQ R9, BX + ADOXQ R11, BX + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 CX | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 BX | 19 272(SP) | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u4 @ 112(SP) + MOVQ 112(SP), DX + + // | + +/* */ + + // | j9 + + // | w13 @ CX + MULXQ 72(R8), AX, R9 + ADOXQ AX, CX + MOVQ CX, 104(SP) + ADCXQ R9, R15 + + // | j10 + + // | w14 @ R15 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j11 + + // | w15 @ R14 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j12 + + // | w16 @ R13 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j13 + + // | w17 @ R12 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j14 + + // | w18 @ BX + MULXQ 112(R8), AX, R9 + ADOXQ AX, BX + + // | w19 @ 272(SP) + // | move to an idle register + MOVQ 272(SP), CX + + // | w19 @ CX + ADCXQ R9, CX + ADOXQ R11, 
CX + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R15 + // | 15 R14 | 16 R13 | 17 R12 | 18 BX | 19 CX | 20 72(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u5 @ 120(SP) + MOVQ 120(SP), DX + + // | + +/* */ + + // | j9 + + // | w14 @ R15 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R15 + MOVQ R15, 112(SP) + ADCXQ R9, R14 + + // | j10 + + // | w15 @ R14 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, R13 + + // | j11 + + // | w16 @ R13 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j12 + + // | w17 @ R12 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j13 + + // | w18 @ BX + MULXQ 104(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, CX + + // | j14 + + // | w19 @ CX + MULXQ 112(R8), AX, R9 + ADOXQ AX, CX + + // | w20 @ 72(SP) + // | move to an idle register + MOVQ 72(SP), DI + + // | w20 @ DI + ADCXQ R9, DI + ADOXQ R11, DI + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 R14 | 16 R13 | 17 R12 | 18 BX | 19 CX | 20 DI | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u6 @ 128(SP) + MOVQ 128(SP), DX + + // | + +/* */ + + // | j9 + + // | w15 @ R14 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R14 + MOVQ R14, 72(SP) + ADCXQ R9, R13 + + // | j10 + + // | w16 @ R13 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j11 + + // | w17 @ R12 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j12 + + // | w18 @ BX + MULXQ 96(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, CX + + // | j13 + + // | w19 @ CX + MULXQ 104(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, DI + + // | j14 + + // | w20 @ DI + MULXQ 112(R8), AX, R9 + ADOXQ AX, DI + + // | w21 @ 64(SP) + // | move to an idle register + MOVQ 64(SP), SI + + // | w21 @ SI + ADCXQ R9, SI + ADOXQ R11, SI + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 72(SP) | 16 R13 | 17 R12 | 18 BX | 19 CX | 20 DI | 21 SI | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u7 @ 136(SP) + MOVQ 136(SP), DX + + // | + +/* */ + + // | j9 + + // | w16 @ R13 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j10 + + // | w17 @ R12 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j11 + + // | w18 @ BX + MULXQ 88(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, CX + + // | j12 + + // | w19 @ CX + MULXQ 96(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, DI + + // | j13 + + // | w20 @ DI + MULXQ 104(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, SI + + // | j14 + + // | w21 @ SI + MULXQ 112(R8), AX, R9 + ADOXQ AX, SI + + // | w22 @ 56(SP) + // | move to an idle register + MOVQ 56(SP), R14 + + // | w22 @ R14 + ADCXQ R9, R14 + ADOXQ R11, R14 + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 8 */ + + // | + // | W + // | 0 - | 1 - | 
2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 72(SP) | 16 R13 | 17 R12 | 18 BX | 19 CX | 20 DI | 21 SI | 22 R14 | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u8 @ 144(SP) + MOVQ 144(SP), DX + + // | + +/* */ + + // | j9 + + // | w17 @ R12 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j10 + + // | w18 @ BX + MULXQ 80(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, CX + + // | j11 + + // | w19 @ CX + MULXQ 88(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, DI + + // | j12 + + // | w20 @ DI + MULXQ 96(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, SI + + // | j13 + + // | w21 @ SI + MULXQ 104(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, R14 + + // | j14 + + // | w22 @ R14 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R14 + + // | w23 @ 48(SP) + // | move to an idle register + MOVQ 48(SP), R15 + + // | w23 @ R15 + ADCXQ R9, R15 + ADOXQ R11, R15 + MOVQ $0x00, R11 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 72(SP) | 16 R13 | 17 R12 | 18 BX | 19 CX | 20 DI | 21 SI | 22 R14 | 23 R15 | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | save the carry from q2 + // | should be added to w24 + MOVQ R11, 152(SP) + + // | + +/* q2 q3 transition swap */ + + MOVQ 80(SP), R11 + MOVQ R15, 48(SP) + MOVQ 88(SP), R15 + MOVQ R14, 56(SP) + MOVQ 96(SP), R14 + MOVQ SI, 64(SP) + MOVQ 104(SP), SI + MOVQ DI, 80(SP) + MOVQ 112(SP), DI + MOVQ CX, 88(SP) + MOVQ 72(SP), CX + + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 R11 | 11 R15 | 12 R14 | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 88(SP) | 20 80(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | + +/* montgomery reduction q3 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 9 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R10 | 10 R11 | 11 R15 | 12 R14 | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 88(SP) | 20 80(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u9 = w9 * inp + MOVQ R10, DX + MULXQ inp+32(FP), DX, R9 + + // | save u9 + MOVQ DX, 72(SP) + + // | + +/* */ + + // | j0 + + // | w9 @ R10 + MULXQ (R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j1 + + // | w10 @ R11 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j2 + + // | w11 @ R15 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j3 + + // | w12 @ R14 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j4 + + // | w13 @ SI + MULXQ 32(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j5 + + // | w14 @ DI + MULXQ 40(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j6 + + // | w15 @ CX + MULXQ 48(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j7 + + // | w16 @ R13 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j8 + + // | w17 @ R12 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + ADOXQ R10, BX + ADCXQ R10, R10 + MOVQ $0x00, AX + ADOXQ AX, R10 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 10 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R11 | 11 R15 | 12 R14 | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 88(SP) | 
20 80(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u10 = w10 * inp + MOVQ R11, DX + MULXQ inp+32(FP), DX, R9 + + // | save u10 + MOVQ DX, 96(SP) + + // | + +/* */ + + // | j0 + + // | w10 @ R11 + MULXQ (R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j1 + + // | w11 @ R15 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j2 + + // | w12 @ R14 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j3 + + // | w13 @ SI + MULXQ 24(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j4 + + // | w14 @ DI + MULXQ 32(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j5 + + // | w15 @ CX + MULXQ 40(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j6 + + // | w16 @ R13 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j7 + + // | w17 @ R12 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j8 + + // | w18 @ BX + MULXQ 64(R8), AX, R9 + ADOXQ AX, BX + + // | w19 @ 88(SP) + // | move to temp register + MOVQ 88(SP), AX + ADCXQ R9, AX + ADOXQ R10, AX + + // | move to an idle register + // | w19 @ AX + MOVQ AX, R10 + ADCXQ R11, R11 + MOVQ $0x00, AX + ADOXQ AX, R11 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 R15 | 12 R14 | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 80(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u11 = w11 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R9 + + // | save u11 + MOVQ DX, 88(SP) + + // | + +/* */ + + // | j0 + + // | w11 @ R15 + MULXQ (R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j1 + + // | w12 @ R14 + MULXQ 8(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j2 + + // | w13 @ SI + MULXQ 16(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j3 + + // | w14 @ DI + MULXQ 24(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j4 + + // | w15 @ CX + MULXQ 32(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j5 + + // | w16 @ R13 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j6 + + // | w17 @ R12 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j7 + + // | w18 @ BX + MULXQ 56(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j8 + + // | w19 @ R10 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R10 + + // | w20 @ 80(SP) + // | move to temp register + MOVQ 80(SP), AX + ADCXQ R9, AX + ADOXQ R11, AX + + // | move to an idle register + // | w20 @ AX + MOVQ AX, R11 + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 12 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 R14 | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 R11 | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u12 = w12 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R9 + + // | save u12 + MOVQ DX, 80(SP) + + // | + +/* */ + + // | j0 + + // | w12 @ R14 + MULXQ (R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j1 + + // | w13 @ SI + MULXQ 8(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j2 + + // | w14 @ DI + MULXQ 16(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j3 + + // | w15 @ CX + MULXQ 24(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j4 + + // | w16 @ R13 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j5 + + // | w17 @ R12 + 
MULXQ 40(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j6 + + // | w18 @ BX + MULXQ 48(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j7 + + // | w19 @ R10 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j8 + + // | w20 @ R11 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R11 + + // | w21 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R9, AX + ADOXQ R15, AX + + // | move to an idle register + // | w21 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 + + // | clear flags + XORQ AX, AX + + // | + +/* i = 13 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 SI | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 R11 | 21 R15 | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u13 = w13 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R9 + + // | save u13 + MOVQ DX, 64(SP) + + // | + +/* */ + + // | j0 + + // | w13 @ SI + MULXQ (R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, DI + + // | j1 + + // | w14 @ DI + MULXQ 8(R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j2 + + // | w15 @ CX + MULXQ 16(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j3 + + // | w16 @ R13 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j4 + + // | w17 @ R12 + MULXQ 32(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j5 + + // | w18 @ BX + MULXQ 40(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j6 + + // | w19 @ R10 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j7 + + // | w20 @ R11 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j8 + + // | w21 @ R15 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R15 + + // | w22 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R9, AX + ADOXQ R14, AX + + // | move to an idle register + // | w22 @ AX + MOVQ AX, R14 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI + + // | clear flags + XORQ AX, AX + + // | + +/* i = 14 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 DI + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 R11 | 21 R15 | 22 R14 | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u14 = w14 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R9 + + // | save u14 + MOVQ DX, 56(SP) + + // | + +/* */ + + // | j0 + + // | w14 @ DI + MULXQ (R8), AX, R9 + ADOXQ AX, DI + ADCXQ R9, CX + + // | j1 + + // | w15 @ CX + MULXQ 8(R8), AX, R9 + ADOXQ AX, CX + ADCXQ R9, R13 + + // | j2 + + // | w16 @ R13 + MULXQ 16(R8), AX, R9 + ADOXQ AX, R13 + ADCXQ R9, R12 + + // | j3 + + // | w17 @ R12 + MULXQ 24(R8), AX, R9 + ADOXQ AX, R12 + ADCXQ R9, BX + + // | j4 + + // | w18 @ BX + MULXQ 32(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j5 + + // | w19 @ R10 + MULXQ 40(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j6 + + // | w20 @ R11 + MULXQ 48(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j7 + + // | w21 @ R15 + MULXQ 56(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j8 + + // | w22 @ R14 + MULXQ 64(R8), AX, R9 + ADOXQ AX, R14 + + // | w23 @ 48(SP) + // | move to temp register + MOVQ 48(SP), AX + ADCXQ R9, AX + ADOXQ SI, AX + + // | move to an idle register + // | w23 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 R11 | 21 R15 | 22 R14 | 23 SI | 24 
40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w24 + ADCQ 152(SP), DI + + // | + +/* montgomerry reduction q4 */ + + // | clear flags + XORQ AX, AX + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 BX | 19 R10 | 20 R11 | 21 R15 | 22 R14 | 23 SI | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u0 @ 72(SP) + MOVQ 72(SP), DX + + // | + +/* */ + + // | j9 + + // | w18 @ BX + MULXQ 72(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + MOVQ BX, 48(SP) + + // | j10 + + // | w19 @ R10 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j11 + + // | w20 @ R11 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j12 + + // | w21 @ R15 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j13 + + // | w22 @ R14 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j14 + + // | w23 @ SI + MULXQ 112(R8), AX, R9 + ADOXQ AX, SI + + // | w24 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), BX + ADCXQ R9, BX + + // | bring carry from q2 & q3 + // | w24 @ BX + ADOXQ DI, BX + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 R10 | 20 R11 | 21 R15 | 22 R14 | 23 SI | 24 BX | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u1 @ 96(SP) + MOVQ 96(SP), DX + + // | + +/* */ + + // | j9 + + // | w19 @ R10 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + MOVQ R10, 40(SP) + + // | j10 + + // | w20 @ R11 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j11 + + // | w21 @ R15 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j12 + + // | w22 @ R14 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j13 + + // | w23 @ SI + MULXQ 104(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, BX + + // | j14 + + // | w24 @ BX + MULXQ 112(R8), AX, R9 + ADOXQ AX, BX + + // | w25 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), R10 + ADCXQ R9, R10 + + // | w25 @ R10 + ADOXQ DI, R10 + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI + + // | + +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 40(SP) | 20 R11 | 21 R15 | 22 R14 | 23 SI | 24 BX | 25 R10 | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u2 @ 88(SP) + MOVQ 88(SP), DX + + // | + +/* */ + + // | j9 + + // | w20 @ R11 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + MOVQ R11, 32(SP) + + // | j10 + + // | w21 @ R15 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + + // | j11 + + // | w22 @ R14 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j12 + + // | w23 @ SI + MULXQ 96(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, BX + + // | j13 + + // | w24 @ BX + MULXQ 104(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j14 + + // | w25 @ R10 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R10 + + // | w26 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), R11 + ADCXQ R9, R11 + + // | w26 @ R11 + ADOXQ DI, R11 + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI + + // | + +/* i = 3 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 
10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 R15 | 22 R14 | 23 SI | 24 BX | 25 R10 | 26 R11 | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | u3 @ 80(SP) + MOVQ 80(SP), DX + + // | + +/* */ + + // | j9 + + // | w21 @ R15 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 + MOVQ R15, 24(SP) + + // | j10 + + // | w22 @ R14 + MULXQ 80(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + + // | j11 + + // | w23 @ SI + MULXQ 88(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, BX + + // | j12 + + // | w24 @ BX + MULXQ 96(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j13 + + // | w25 @ R10 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j14 + + // | w26 @ R11 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R11 + + // | w27 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), R15 + ADCXQ R9, R15 + + // | w27 @ R15 + ADOXQ DI, R15 + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI + + // | + +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 R14 | 23 SI | 24 BX | 25 R10 | 26 R11 | 27 R15 | 28 8(SP) | 29 (SP) + - // | - MOVQ p+24(FP), SI - CMOVQCS (SI), AX - MOVQ AX, 48(SP) - CMOVQCS 8(SI), AX - MOVQ AX, 56(SP) - CMOVQCS 16(SI), AX - MOVQ AX, 64(SP) - CMOVQCS 24(SI), AX - MOVQ AX, 72(SP) - CMOVQCS 32(SI), AX - MOVQ AX, 80(SP) - CMOVQCS 40(SI), AX - MOVQ AX, 88(SP) - CMOVQCS 48(SI), AX - MOVQ AX, 96(SP) - CMOVQCS 56(SI), AX - MOVQ AX, 104(SP) - CMOVQCS 64(SI), AX - MOVQ AX, 112(SP) - CMOVQCS 72(SI), AX - MOVQ AX, 120(SP) - CMOVQCS 80(SI), AX - MOVQ AX, 128(SP) - CMOVQCS 88(SI), AX - MOVQ AX, 136(SP) - CMOVQCS 96(SI), AX - MOVQ AX, 144(SP) - CMOVQCS 104(SI), AX - MOVQ AX, 152(SP) - CMOVQCS 112(SI), AX - MOVQ AX, 160(SP) - CMOVQCS 120(SI), AX - MOVQ AX, 168(SP) + // | u4 @ 64(SP) + MOVQ 64(SP), DX - // | - MOVQ c+0(FP), DI - ADDQ 48(SP), CX - MOVQ CX, (DI) - ADCQ 56(SP), DX - MOVQ DX, 8(DI) - ADCQ 64(SP), R8 - MOVQ R8, 16(DI) - ADCQ 72(SP), R9 - MOVQ R9, 24(DI) - ADCQ 80(SP), R10 - MOVQ R10, 32(DI) - ADCQ 88(SP), R11 - MOVQ R11, 40(DI) - ADCQ 96(SP), R12 - MOVQ R12, 48(DI) - ADCQ 104(SP), R13 - MOVQ R13, 56(DI) - ADCQ 112(SP), R14 - MOVQ R14, 64(DI) - ADCQ 120(SP), R15 - MOVQ R15, 72(DI) - MOVQ (SP), BX - ADCQ 128(SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - ADCQ 136(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - ADCQ 144(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - ADCQ 152(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - ADCQ 160(SP), BX - MOVQ BX, 112(DI) - MOVQ 40(SP), BX - ADCQ 168(SP), BX - MOVQ BX, 120(DI) - RET + // | -// func subn16(a *[16]uint64, b *[16]uint64) uint64 -TEXT ·subn16(SB), NOSPLIT, $48-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX +/* */ - // | - MOVQ (DI), CX - SUBQ (SI), CX - MOVQ 8(DI), DX - SBBQ 8(SI), DX - MOVQ 16(DI), R8 - SBBQ 16(SI), R8 - MOVQ 24(DI), R9 - SBBQ 24(SI), R9 - MOVQ 32(DI), R10 - SBBQ 32(SI), R10 - MOVQ 40(DI), R11 - SBBQ 40(SI), R11 - MOVQ 48(DI), R12 - SBBQ 48(SI), R12 - MOVQ 56(DI), R13 - SBBQ 56(SI), R13 - MOVQ 64(DI), R14 - SBBQ 64(SI), R14 - MOVQ 72(DI), R15 - SBBQ 72(SI), R15 - MOVQ 80(DI), BX - SBBQ 80(SI), BX - MOVQ BX, (SP) - MOVQ 88(DI), BX - SBBQ 88(SI), BX - MOVQ BX, 8(SP) - MOVQ 96(DI), BX - SBBQ 96(SI), BX - MOVQ BX, 16(SP) - MOVQ 104(DI), BX - SBBQ 104(SI), BX - MOVQ BX, 24(SP) - MOVQ 112(DI), BX - SBBQ 112(SI), BX - MOVQ BX, 32(SP) - MOVQ 120(DI), BX - SBBQ 120(SI), BX - MOVQ BX, 40(SP) - ADCQ $0x00, 
AX + // | j9 - // | - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - MOVQ 40(SP), BX - MOVQ BX, 120(DI) - MOVQ AX, ret+16(FP) - RET + // | w22 @ R14 + MULXQ 72(R8), AX, R9 + ADOXQ AX, R14 + ADCXQ R9, SI + MOVQ R14, 16(SP) -// func _neg16(c *[16]uint64, a *[16]uint64, p *[16]uint64) -TEXT ·_neg16(SB), NOSPLIT, $48-24 - // | - MOVQ a+8(FP), DI + // | j10 - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - MOVQ 8(SI), DX - SBBQ 8(DI), DX - MOVQ 16(SI), R8 - SBBQ 16(DI), R8 - MOVQ 24(SI), R9 - SBBQ 24(DI), R9 - MOVQ 32(SI), R10 - SBBQ 32(DI), R10 - MOVQ 40(SI), R11 - SBBQ 40(DI), R11 - MOVQ 48(SI), R12 - SBBQ 48(DI), R12 - MOVQ 56(SI), R13 - SBBQ 56(DI), R13 - MOVQ 64(SI), R14 - SBBQ 64(DI), R14 - MOVQ 72(SI), R15 - SBBQ 72(DI), R15 - MOVQ 80(SI), BX - SBBQ 80(DI), BX - MOVQ BX, (SP) - MOVQ 88(SI), BX - SBBQ 88(DI), BX - MOVQ BX, 8(SP) - MOVQ 96(SI), BX - SBBQ 96(DI), BX - MOVQ BX, 16(SP) - MOVQ 104(SI), BX - SBBQ 104(DI), BX - MOVQ BX, 24(SP) - MOVQ 112(SI), BX - SBBQ 112(DI), BX - MOVQ BX, 32(SP) - MOVQ 120(SI), BX - SBBQ 120(DI), BX - MOVQ BX, 40(SP) + // | w23 @ SI + MULXQ 80(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, BX + + // | j11 + + // | w24 @ BX + MULXQ 88(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 + + // | j12 + + // | w25 @ R10 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 + + // | j13 + + // | w26 @ R11 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 + + // | j14 + + // | w27 @ R15 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R15 + + // | w28 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), R14 + ADCXQ R9, R14 + + // | w28 @ R14 + ADOXQ DI, R14 + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI + + // | + +/* i = 5 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 16(SP) | 23 SI | 24 BX | 25 R10 | 26 R11 | 27 R15 | 28 R14 | 29 (SP) - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - MOVQ DX, 8(DI) - MOVQ R8, 16(DI) - MOVQ R9, 24(DI) - MOVQ R10, 32(DI) - MOVQ R11, 40(DI) - MOVQ R12, 48(DI) - MOVQ R13, 56(DI) - MOVQ R14, 64(DI) - MOVQ R15, 72(DI) - MOVQ (SP), BX - MOVQ BX, 80(DI) - MOVQ 8(SP), BX - MOVQ BX, 88(DI) - MOVQ 16(SP), BX - MOVQ BX, 96(DI) - MOVQ 24(SP), BX - MOVQ BX, 104(DI) - MOVQ 32(SP), BX - MOVQ BX, 112(DI) - MOVQ 40(SP), BX - MOVQ BX, 120(DI) - RET -// func mul_two_16(a *[16]uint64) -TEXT ·mul_two_16(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RCLQ $0x01, 8(DI) - RCLQ $0x01, 16(DI) - RCLQ $0x01, 24(DI) - RCLQ $0x01, 32(DI) - RCLQ $0x01, 40(DI) - RCLQ $0x01, 48(DI) - RCLQ $0x01, 56(DI) - RCLQ $0x01, 64(DI) - RCLQ $0x01, 72(DI) - RCLQ $0x01, 80(DI) - RCLQ $0x01, 88(DI) - RCLQ $0x01, 96(DI) - RCLQ $0x01, 104(DI) - RCLQ $0x01, 112(DI) - RCLQ $0x01, 120(DI) - RET + // | u5 @ 56(SP) + MOVQ 56(SP), DX -// func div_two_16(a *[16]uint64) -TEXT ·div_two_16(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, 120(DI) - RCRQ $0x01, 112(DI) - RCRQ $0x01, 104(DI) - RCRQ $0x01, 96(DI) - RCRQ $0x01, 88(DI) - RCRQ $0x01, 80(DI) - RCRQ $0x01, 72(DI) - RCRQ $0x01, 64(DI) - RCRQ $0x01, 56(DI) - RCRQ $0x01, 48(DI) - RCRQ $0x01, 40(DI) - RCRQ $0x01, 32(DI) - RCRQ 
$0x01, 24(DI) - RCRQ $0x01, 16(DI) - RCRQ $0x01, 8(DI) - RCRQ $0x01, (DI) - RET + // | -// func mul9(c *[18]uint64, a *[9]uint64, b *[9]uint64, p *[9]uint64, inp uint64) -TEXT ·mul9(SB), $136-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI +/* */ - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - - // | - // | b0 - MOVQ (SI), CX + // | j9 - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | w23 @ SI + MULXQ 72(R8), AX, R9 + ADOXQ AX, SI + ADCXQ R9, BX - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 + // | j10 - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | w24 @ BX + MULXQ 80(R8), AX, R9 + ADOXQ AX, BX + ADCXQ R9, R10 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 + // | j11 - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | w25 @ R10 + MULXQ 88(R8), AX, R9 + ADOXQ AX, R10 + ADCXQ R9, R11 - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | j12 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 + // | w26 @ R11 + MULXQ 96(R8), AX, R9 + ADOXQ AX, R11 + ADCXQ R9, R15 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | j13 - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | w27 @ R15 + MULXQ 104(R8), AX, R9 + ADOXQ AX, R15 + ADCXQ R9, R14 - // | - // | b1 - MOVQ 8(SI), CX + // | j14 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | w28 @ R14 + MULXQ 112(R8), AX, R9 + ADOXQ AX, R14 - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | w29 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R9, AX - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | w29 @ AX + ADOXQ DI, AX + MOVQ $0x00, DI + ADCXQ DI, DI + MOVQ $0x00, R9 + ADOXQ R9, DI - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 CX | 16 R13 | 17 R12 | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 16(SP) | 23 SI | 24 BX | 25 R10 | 26 R11 | 27 R15 | 28 R14 | 29 AX - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a6 * b1 - // | (w7, w8, w9, 
w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* modular reduction */ - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + MOVQ CX, R9 + SUBQ (R8), R9 + MOVQ R13, DX + SBBQ 8(R8), DX + MOVQ DX, (SP) + MOVQ R12, DX + SBBQ 16(R8), DX + MOVQ DX, 8(SP) + MOVQ 48(SP), DX + SBBQ 24(R8), DX + MOVQ DX, 56(SP) + MOVQ 40(SP), DX + SBBQ 32(R8), DX + MOVQ DX, 64(SP) + MOVQ 32(SP), DX + SBBQ 40(R8), DX + MOVQ DX, 72(SP) + MOVQ 24(SP), DX + SBBQ 48(R8), DX + MOVQ DX, 80(SP) + MOVQ 16(SP), DX + SBBQ 56(R8), DX + MOVQ DX, 88(SP) + MOVQ SI, DX + SBBQ 64(R8), DX + MOVQ DX, 96(SP) + MOVQ BX, DX + SBBQ 72(R8), DX + MOVQ DX, 104(SP) + MOVQ R10, DX + SBBQ 80(R8), DX + MOVQ DX, 112(SP) + MOVQ R11, DX + SBBQ 88(R8), DX + MOVQ DX, 120(SP) + MOVQ R15, DX + SBBQ 96(R8), DX + MOVQ DX, 128(SP) + MOVQ R14, DX + SBBQ 104(R8), DX + MOVQ DX, 136(SP) + MOVQ AX, DX + SBBQ 112(R8), DX + MOVQ DX, 144(SP) + SBBQ $0x00, DI - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | - // | b2 - MOVQ 16(SI), CX +/* out */ - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + MOVQ c+0(FP), DI + CMOVQCC R9, CX + MOVQ CX, (DI) + CMOVQCC (SP), R13 + MOVQ R13, 8(DI) + CMOVQCC 8(SP), R12 + MOVQ R12, 16(DI) + MOVQ 48(SP), DX + CMOVQCC 56(SP), DX + MOVQ DX, 24(DI) + MOVQ 40(SP), DX + CMOVQCC 64(SP), DX + MOVQ DX, 32(DI) + MOVQ 32(SP), DX + CMOVQCC 72(SP), DX + MOVQ DX, 40(DI) + MOVQ 24(SP), DX + CMOVQCC 80(SP), DX + MOVQ DX, 48(DI) + MOVQ 16(SP), DX + CMOVQCC 88(SP), DX + MOVQ DX, 56(DI) + CMOVQCC 96(SP), SI + MOVQ SI, 64(DI) + CMOVQCC 104(SP), BX + MOVQ BX, 72(DI) + CMOVQCC 112(SP), R10 + MOVQ R10, 80(DI) + CMOVQCC 120(SP), R11 + MOVQ R11, 88(DI) + CMOVQCC 128(SP), R15 + MOVQ R15, 96(DI) + CMOVQCC 136(SP), R14 + MOVQ R14, 104(DI) + CMOVQCC 144(SP), AX + MOVQ AX, 112(DI) + RET - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 +/* end */ - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) +// func mul_no_adx_bmi2_15(c *[15]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_15(SB), NOSPLIT, $288-40 + // | - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* inputs */ - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b0 + MOVQ (SI), AX MULQ CX - 
ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + MOVQ AX, (SP) + MOVQ DX, R8 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX + // | a0 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX + // | a0 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 - // | - // | b3 - MOVQ 24(SI), CX + // | a0 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX + // | a0 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R11 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX + // | a0 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX + // | a0 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX + // | a0 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) +/* i = 1 */ - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 8(SP) + MOVQ $0x00, R8 - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX + // | a1 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX + // | a1 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - // | b4 - MOVQ 32(SI), CX + // | a1 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX + // | a1 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX + // | a1 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX + // | a1 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + 
ADCQ BX, R8 - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX + // | a1 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R8 - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) +/* i = 2 */ - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX + // | a2 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX + // | a2 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - // | b5 - MOVQ 40(SI), CX + // | a2 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX + // | a2 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX + // | a2 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX + // | a2 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R8 + ADCQ BX, R9 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX + // | a2 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADDQ AX, R8 + ADCQ DX, R9 - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 3 */ - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX + // | a3 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 
40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX + // | a3 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - // | b6 - MOVQ 48(SI), CX + // | a3 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX + // | a3 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX + // | a3 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX + // | a3 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX + // | a3 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R9 + ADCQ DX, R10 - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 4 */ - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX + // | a4 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX + // | a4 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - // | b7 - MOVQ 56(SI), CX + // | a4 * b3 + MOVQ 24(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX + // | a4 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX + // | a4 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADDQ AX, R8 + ADCQ DX, R9 
+ ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX + // | a4 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX + // | a4 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R10 + ADCQ DX, R11 - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* i = 5 */ - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX + // | a5 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 - // | a8 * b7 - // | (w15, w16, w17) @ (56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX + // | a5 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - - // | - // | b8 - MOVQ 64(SI), CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX + // | a5 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX + // | a5 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX + // | a5 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX + // | a5 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX + // | a5 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX + // | a5 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R11 + ADCQ DX, R12 - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - 
ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a7 * b8 - // | (w15, w16, w17) @ (56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) +/* i = 6 */ - // | a8 * b8 - // | (w16, w17) @ (64(SP), 72(SP)) - MOVQ 64(DI), AX + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 - // | - // | Montgomerry Reduction - MOVQ R15, 80(SP) - MOVQ R14, 88(SP) - MOVQ p+24(FP), R14 + // | a6 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a6 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX + // | a6 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX + // | a6 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX + // | a6 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX + // | a6 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX + // | a6 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | - // | w6 @ 88(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, DI - ADDQ SI, 88(SP) - ADCQ $0x00, DI +/* i = 7 */ - // | w7 @ 80(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, SI - ADDQ DI, 80(SP) - ADCQ $0x00, SI + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a7 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI - - // | w9 @ 8(SP) - ADDQ DI, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 - // | - MOVQ 16(SP), R8 + // | a7 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a7 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX + // | a7 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX + // | a7 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX + // | a7 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ 
SI, R11 - ADCQ $0x00, DI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX + // | a7 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R13 + ADCQ BX, R14 - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX + // | a7 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 - // | w6 @ 88(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, SI - ADDQ DI, 88(SP) - ADCQ $0x00, SI + // | - // | w7 @ 80(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, DI - ADDQ SI, 80(SP) - ADCQ $0x00, DI +/* i = 8 */ - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a8 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI - - // | w10 @ R8 - ADDQ DI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - - // | - MOVQ 24(SP), R9 + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a8 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX + // | a8 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a8 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX + // | a8 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX + // | a8 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX + // | a8 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 - // | w6 @ 88(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a8 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, DI - ADDQ SI, 88(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 - // | w7 @ 80(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, SI - ADDQ DI, 80(SP) - ADCQ $0x00, SI + // | - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI +/* i = 9 */ - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX - // | w10 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX + // | a9 * b0 + MOVQ (SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 - // | w11 @ R9 - ADDQ DI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - - // | - MOVQ 32(SP), R10 + // | a9 * b1 + MOVQ 8(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - 
MOVQ AX, CX + // | a9 * b2 + MOVQ 16(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX + // | a9 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX + // | a9 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX + // | a9 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 88(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a9 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, SI - ADDQ DI, 88(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | w7 @ 80(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a9 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, DI - ADDQ SI, 80(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 80(SP) + MOVQ $0x00, R9 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a10 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX + // | a10 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX + // | a10 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ R10 - ADDQ DI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a10 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ 40(SP), R11 + // | a10 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a10 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX + // | a10 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, R12 - ADCQ DX, DI + ADDQ AX, R8 + ADCQ DX, R9 - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 88(SP) + MOVQ $0x00, R10 - // | w6 @ 88(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a11 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, DI - ADDQ SI, 88(SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 80(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a11 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, SI - ADDQ DI, 80(SP) - 
ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a11 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a11 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ R8 - XORQ DI, DI - MOVQ 48(R14), AX + // | a11 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX + // | a11 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | w12 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX + // | a11 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 - // | w13 @ R11 - ADDQ DI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 48(SP), R12 +/* i = 12 */ - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX + // | a12 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 96(SP) + MOVQ $0x00, R11 - // | w6 @ 88(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a12 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, SI - ADDQ DI, 88(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 80(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a12 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, DI - ADDQ SI, 80(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a12 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a12 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX + // | a12 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX + // | a12 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADCQ DX, R10 + ADCQ BX, R11 - // | w12 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX + // | a12 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI - - // | w13 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R11 - // | w14 @ R12 - ADDQ DI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 56(SP), R13 +/* i = 13 */ - // | (u @ CX) = (w6 @ 88(SP)) * inp - MOVQ 88(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a13 @ CX + MOVQ 104(DI), CX 
+ MOVQ $0x00, BX - // | w6 @ 88(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a13 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 104(SP) + MOVQ $0x00, R12 - // | w7 @ 80(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a13 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, SI - ADDQ DI, 80(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a13 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a13 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX + // | a13 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX + // | a13 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX + // | a13 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ BX, R12 - // | w13 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX + // | a13 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI - - // | w14 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R12 - // | w15 @ R13 - ADDQ DI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 64(SP), BX - MOVQ BX, 88(SP) +/* i = 14 */ - // | (u @ CX) = (w7 @ 80(SP)) * inp - MOVQ 80(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a14 @ CX + MOVQ 112(DI), CX + MOVQ $0x00, BX - // | w7 @ 80(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a14 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a14 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a14 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX + // | a14 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX + // | a14 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX + // | a14 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX + // | a14 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - 
ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R12 + ADCQ $0x00, BX - // | w14 @ R12 - XORQ SI, SI - MOVQ 56(R14), AX + // | a14 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, BX - // | w15 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | - // | w16 @ 88(SP) - ADDQ DI, R15 - ADCQ R15, 88(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* */ - // | - MOVQ 72(SP), BX - MOVQ BX, 80(SP) + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R13 + // | 15 R14 | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 BX | 23 - | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + MOVQ R13, 112(SP) + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ R9, 144(SP) + MOVQ R10, 152(SP) + MOVQ R11, 160(SP) + MOVQ R12, 168(SP) + MOVQ BX, 176(SP) - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 - | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MOVQ AX, 184(SP) + MOVQ DX, R8 - // | w10 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX + // | a0 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 - // | w11 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX + // | a0 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 - // | w12 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX + // | a0 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 - // | w13 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX + // | a0 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 - // | w14 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX + // | a0 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 - // | w15 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX + // | a0 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI - - // | w16 @ 88(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, DI - ADDQ SI, 88(SP) - ADCQ $0x00, DI - - // | w17 @ 80(SP) - ADDQ DI, R15 - ADCQ R15, 80(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - - // | Reduce by modulus - MOVQ 8(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, (SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 96(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 104(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 112(SP) - MOVQ 88(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 120(SP) - MOVQ 80(SP), 
BX - SBBQ 64(R14), BX - MOVQ BX, 128(SP) - SBBQ $0x00, R15 - - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 8(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC (SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 96(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 104(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 112(SP), R13 - MOVQ R13, 48(DI) - MOVQ 88(SP), BX - CMOVQCC 120(SP), BX - MOVQ BX, 56(DI) - MOVQ 80(SP), BX - CMOVQCC 128(SP), BX - MOVQ BX, 64(DI) - RET + ADCQ DX, R14 + // | -// func mul10(c *[20]uint64, a *[10]uint64, b *[10]uint64, p *[10]uint64, inp uint64) -TEXT ·mul10(SB), $160-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI +/* i = 1 */ - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - - // | - // | b0 - MOVQ (SI), CX + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX + // | a1 * b8 + MOVQ 64(SI), AX MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 192(SP) + MOVQ $0x00, R8 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX + // | a1 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R9 ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX + // | a1 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R10 ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX + // | a1 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R11 ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX + // | a1 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX + // | a1 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 + ADCQ BX, R15 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX + // | a1 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) +/* i = 2 */ - // | - // | b1 - MOVQ 8(SI), CX + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX + // | a2 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R9 ADCQ DX, R10 ADCQ $0x00, R11 - ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R9, 200(SP) + MOVQ $0x00, R9 - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX + // | a2 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R10 ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX + // | a2 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R11 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ BX, R13 + MOVQ 
$0x00, BX + ADCQ $0x00, BX - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX + // | a2 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX + // | a2 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX + // | a2 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX + // | a2 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADCQ DX, R8 - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 3 */ - // | - // | b2 - MOVQ 16(SI), CX + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX + // | a3 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R10 ADCQ DX, R11 ADCQ $0x00, R12 - ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R10, 208(SP) + MOVQ $0x00, R10 - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX + // | a3 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R11 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX + // | a3 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX + // | a3 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX + // | a3 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX + // | a3 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADCQ DX, R8 + ADCQ BX, R9 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX + // | a3 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ 
$0x00, 40(SP) + // | - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 4 */ - // | - // | b3 - MOVQ 24(SI), CX + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX + // | a4 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R11 ADCQ DX, R12 ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R11, 216(SP) + MOVQ $0x00, R11 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX + // | a4 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX + // | a4 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX + // | a4 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX + // | a4 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) - - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) - - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX + // | a4 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX + // | a4 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R9 + ADCQ DX, R10 - // | - // | b4 - MOVQ 32(SI), CX + // | - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX +/* i = 5 */ + + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX + + // | a5 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R12 ADCQ DX, R13 ADCQ $0x00, R14 - ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R12, 224(SP) + MOVQ $0x00, R12 - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX + // | a5 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX + // | a5 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX + // | a5 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ 
$0x00, 16(SP) - - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX + // | a5 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX + // | a5 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX + // | a5 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 6 */ - // | - // | b5 - MOVQ 40(SI), CX + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX + // | a6 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R13 ADCQ DX, R14 ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ $0x00, BX + MOVQ R13, 232(SP) + MOVQ $0x00, R13 - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX + // | a6 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX + // | a6 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) - - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX + // | a6 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX + // | a6 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX + // | a6 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX + // | a6 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R11 + ADCQ DX, R12 - // | a8 * b5 - // | 
(w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* i = 7 */ - // | - // | b6 - MOVQ 48(SI), CX + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX + // | a7 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R14 ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 240(SP) + MOVQ $0x00, R14 - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX + // | a7 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX + // | a7 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX + // | a7 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX + // | a7 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX + // | a7 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX + // | a7 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R12 + ADCQ DX, R13 - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* i = 8 */ - // | - // | b7 - MOVQ 56(SI), CX + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX + // | a8 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) - - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ 
$0x00, 32(SP) - - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 248(SP) + MOVQ $0x00, R15 - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX + // | a8 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX + // | a8 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX + // | a8 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX + // | a8 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX + // | a8 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX + // | a8 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) - - // | - // | b8 - MOVQ 64(SI), CX + ADDQ AX, R13 + ADCQ DX, R14 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) +/* i = 9 */ - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX + // | a9 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 256(SP) + MOVQ $0x00, R8 - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX + // | a9 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX + // | a9 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b8 - // | (w14, w15, 
w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX + // | a9 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX + // | a9 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX + // | a9 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 - // | a9 * b8 - // | (w17, w18, w19) @ (72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX + // | a9 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - - // | - // | b9 - MOVQ 72(SI), CX + ADDQ AX, R14 + ADCQ DX, R15 - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 10 */ - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX + // | a10 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 264(SP) + MOVQ $0x00, R9 - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX + // | a10 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX + // | a10 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX + // | a10 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX + // | a10 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b9 - // | (w17, w18, w19) @ (72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX + // | a10 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | a9 * b9 - // | (w18, w19) @ (80(SP), 88(SP)) - MOVQ 72(DI), AX + // | a10 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ 
DX, 88(SP) - - // | - // | Montgomerry Reduction - MOVQ R15, 96(SP) - MOVQ R14, 104(SP) - MOVQ p+24(FP), R14 + ADDQ AX, R15 + ADCQ DX, R8 - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI +/* i = 11 */ - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX + // | a11 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 272(SP) + MOVQ $0x00, R10 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX + // | a11 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX + // | a11 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX + // | a11 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 104(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a11 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, DI - ADDQ SI, 104(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 96(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a11 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, SI - ADDQ DI, 96(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a11 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | + +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 280(SP) + MOVQ $0x00, R11 - // | w10 @ 16(SP) - ADDQ SI, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a12 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ 24(SP), R8 + // | a12 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a12 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX + // | a12 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX + // | a12 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX + // | a12 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX + // | + +/* i = 
13 */ + + // | a13 @ CX + MOVQ 104(DI), CX + MOVQ $0x00, BX + + // | a13 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX + // | a13 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 104(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a13 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, SI - ADDQ DI, 104(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 96(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a13 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, DI - ADDQ SI, 96(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a13 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a13 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a13 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 - // | w11 @ R8 - ADDQ SI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 32(SP), R9 +/* i = 14 */ - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a14 @ CX + MOVQ 112(DI), CX + MOVQ $0x00, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX + // | a14 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX + // | a14 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX + // | a14 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX + // | a14 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 104(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a14 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, DI - ADDQ SI, 104(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 96(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a14 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, SI - ADDQ DI, 96(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a14 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI 
- ADDQ SI, 16(SP) - ADCQ $0x00, DI +/* */ - // | w11 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 184(SP) | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 R10 | 28 R11 | 29 BX - // | w12 @ R9 - ADDQ SI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) + // | 15 120(SP) | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 - | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - + + + MOVQ 64(SP), AX + ADDQ AX, 184(SP) + MOVQ 72(SP), AX + ADCQ AX, 192(SP) + MOVQ 80(SP), AX + ADCQ AX, 200(SP) + MOVQ 88(SP), AX + ADCQ AX, 208(SP) + MOVQ 96(SP), AX + ADCQ AX, 216(SP) + MOVQ 104(SP), AX + ADCQ AX, 224(SP) + MOVQ 112(SP), AX + ADCQ AX, 232(SP) + MOVQ 120(SP), AX + ADCQ AX, 240(SP) + MOVQ 128(SP), AX + ADCQ AX, 248(SP) + MOVQ 136(SP), AX + ADCQ AX, 256(SP) + MOVQ 144(SP), AX + ADCQ AX, 264(SP) + MOVQ 152(SP), AX + ADCQ AX, 272(SP) + MOVQ 160(SP), AX + ADCQ AX, 280(SP) + ADCQ 168(SP), R12 + ADCQ 176(SP), R13 + ADCQ $0x00, R14 ADCQ $0x00, R15 + ADCQ $0x00, R8 + ADCQ $0x00, R9 + ADCQ $0x00, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX - // | + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 184(SP) | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 R12 | 22 R13 | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 R10 | 28 R11 | 29 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R11, 8(SP) + MOVQ 32(SP), R11 + MOVQ R10, 16(SP) MOVQ 40(SP), R10 + MOVQ R9, 24(SP) + MOVQ 48(SP), R9 + MOVQ R8, 32(SP) + MOVQ 56(SP), R8 + MOVQ R15, 40(SP) + MOVQ 184(SP), R15 + MOVQ R14, 48(SP) + MOVQ R13, 56(SP) + MOVQ R12, 64(SP) - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX + // | fetch modulus + MOVQ p+24(FP), R14 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w3 @ R11 - XORQ DI, DI + // | + +/* */ + + // | save u0 + MOVQ R13, 72(SP) + + // | j0 + + // | w0 @ CX MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + MULQ R13 + ADDQ AX, CX + ADCQ DX, R12 - // | w4 @ R12 - XORQ SI, SI + // | j1 + + // | w1 @ DI MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w5 @ R13 - XORQ DI, DI + // | j2 + + // | w2 @ SI MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + 
MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 - // | w6 @ 104(SP) - XORQ SI, SI + // | w3 @ BX MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, SI - ADDQ DI, 104(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w7 @ 96(SP) - XORQ DI, DI + // | j4 + + // | w4 @ R11 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, DI - ADDQ SI, 96(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI + // | j5 + + // | w5 @ R10 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | j6 + + // | w6 @ R9 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | j7 - // | w11 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX + // | w7 @ R8 + MOVQ 56(R14), AX + MULQ R13 ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R8 - // | w12 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w8 @ R15 + ADCQ DX, R15 + ADCQ $0x00, CX - // | w13 @ R10 - ADDQ SI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 48(SP), R11 +/* i = 1 */ - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 192(SP) | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w4 @ R12 - XORQ DI, DI + // | + +/* */ + + // | save u1 + MOVQ R13, 80(SP) + + // | j0 + + // | w1 @ DI MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI + MULQ R13 + ADDQ AX, DI + ADCQ DX, R12 - // | w5 @ R13 - XORQ SI, SI + // | j1 + + // | w2 @ SI MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w6 @ 104(SP) - XORQ DI, DI + // | j2 + + // | w3 @ BX MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, DI - ADDQ SI, 104(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 - // | w7 @ 96(SP) - XORQ SI, SI + // | w4 @ R11 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, SI - ADDQ DI, 96(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ DI, DI + // | j4 + + // | w5 @ R10 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j5 - // | w9 @ 8(SP) - XORQ SI, SI + // | w6 @ R9 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 
+ ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI + // | j6 + + // | w7 @ R8 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j7 - // | w11 @ R8 - XORQ SI, SI + // | w8 @ R15 MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R15 + ADCQ DX, CX + ADDQ R12, R15 - // | w12 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | move to idle register + MOVQ 192(SP), DI - // | w13 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w14 @ R11 - ADDQ SI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 56(SP), R12 +/* i = 2 */ - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 200(SP) | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u2 = w2 * inp + MOVQ SI, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w5 @ R13 - XORQ DI, DI + // | + +/* */ + + // | save u2 + MOVQ R13, 88(SP) + + // | j0 + + // | w2 @ SI MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + MULQ R13 + ADDQ AX, SI + ADCQ DX, R12 + + // | j1 - // | w6 @ 104(SP) - XORQ SI, SI + // | w3 @ BX MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, SI - ADDQ DI, 104(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w7 @ 96(SP) - XORQ DI, DI + // | j2 + + // | w4 @ R11 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, DI - ADDQ SI, 96(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI + // | j3 + + // | w5 @ R10 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | j4 + + // | w6 @ R9 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j5 + + // | w7 @ R8 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ R8 - XORQ DI, DI + // | j6 + + // | w8 @ R15 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R9 - XORQ SI, SI + // | j7 + + // | w9 @ DI MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R12, DI - // | w13 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | move to idle register + MOVQ 200(SP), SI - // | w14 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 
- ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w15 @ R12 - ADDQ SI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 64(SP), R13 +/* i = 3 */ - // | (u @ CX) = (w6 @ 104(SP)) * inp - MOVQ 104(SP), AX + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI | 11 208(SP) | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u3 = w3 * inp + MOVQ BX, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w6 @ 104(SP) - XORQ DI, DI + // | + +/* */ + + // | save u3 + MOVQ R13, 96(SP) + + // | j0 + + // | w3 @ BX MOVQ (R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, DI + MULQ R13 + ADDQ AX, BX + ADCQ DX, R12 - // | w7 @ 96(SP) - XORQ SI, SI + // | j1 + + // | w4 @ R11 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, SI - ADDQ DI, 96(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ DI, DI + // | j2 + + // | w5 @ R10 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ SI, SI + // | j3 + + // | w6 @ R9 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI + // | j4 + + // | w7 @ R8 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ R8 - XORQ SI, SI + // | j5 + + // | w8 @ R15 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R9 - XORQ DI, DI + // | j6 + + // | w9 @ DI MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R10 - XORQ SI, SI + // | j7 + + // | w10 @ SI MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R12, SI - // | w14 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | move to idle register + MOVQ 208(SP), BX - // | w15 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w16 @ R13 - ADDQ SI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 72(SP), BX - MOVQ BX, 104(SP) +/* i = 4 */ - // | (u @ CX) = (w7 @ 96(SP)) * inp - MOVQ 96(SP), AX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI | 11 BX | 12 216(SP) | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u4 = w4 * inp + MOVQ R11, 
AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w7 @ 96(SP) - XORQ DI, DI + // | + +/* */ + + // | save u4 + MOVQ R13, 104(SP) + + // | j0 + + // | w4 @ R11 MOVQ (R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, DI + MULQ R13 + ADDQ AX, R11 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI + // | j1 + + // | w5 @ R10 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | j2 + + // | w6 @ R9 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j3 + + // | w7 @ R8 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ R8 - XORQ DI, DI + // | j4 + + // | w8 @ R15 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R9 - XORQ SI, SI + // | j5 + + // | w9 @ DI MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R10 - XORQ DI, DI + // | j6 + + // | w10 @ SI MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j7 + + // | w11 @ BX + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R12, BX - // | w15 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | move to idle register + MOVQ 216(SP), R11 - // | w16 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w12 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w17 @ 104(SP) - ADDQ SI, R15 - ADCQ R15, 104(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 80(SP), BX - MOVQ BX, 96(SP) +/* i = 5 */ - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R10 | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 224(SP) | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u5 = w5 * inp + MOVQ R10, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w8 @ (SP) - XORQ DI, DI + // | + +/* */ + + // | save u5 + MOVQ R13, 112(SP) + + // | j0 + + // | w5 @ R10 MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + MULQ R13 + ADDQ AX, R10 + ADCQ DX, R12 + + // | j1 - // | w9 @ 8(SP) - XORQ SI, SI + // | w6 @ R9 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI + // | j2 + + // | w7 @ R8 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + 
MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 - // | w11 @ R8 - XORQ SI, SI + // | w8 @ R15 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R9 - XORQ DI, DI + // | j4 + + // | w9 @ DI MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R10 - XORQ SI, SI + // | j5 + + // | w10 @ SI MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R11 - XORQ DI, DI + // | j6 + + // | w11 @ BX MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w15 @ R12 - XORQ SI, SI + // | j7 + + // | w12 @ R11 MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R12, R11 - // | w16 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | move to idle register + MOVQ 224(SP), R10 - // | w17 @ 104(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, SI - ADDQ DI, 104(SP) - ADCQ $0x00, SI + // | w13 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w18 @ 96(SP) - ADDQ SI, R15 - ADCQ R15, 96(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | + +/* i = 6 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R9 | 7 R8 | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 232(SP) + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | - MOVQ 88(SP), BX - MOVQ BX, (SP) - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX + // | | u6 = w6 * inp + MOVQ R9, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R13 + MOVQ $0x00, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | + +/* */ + + // | save u6 + MOVQ R13, 120(SP) + + // | j0 + + // | w6 @ R9 MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + MULQ R13 + ADDQ AX, R9 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j1 + + // | w7 @ R8 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ R8 - XORQ DI, DI + // | j2 + + // | w8 @ R15 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R9 - XORQ SI, SI + // | j3 + + // | w9 @ DI MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R10 - XORQ DI, DI + // | j4 + + // | w10 @ SI MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R11 - XORQ SI, SI + // | j5 + + // | w11 @ BX MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI 
+ MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w15 @ R12 - XORQ DI, DI + // | j6 + + // | w12 @ R11 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + MULQ R13 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w16 @ R13 - XORQ SI, SI + // | j7 + + // | w13 @ R10 MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + MULQ R13 + ADDQ AX, R10 + ADCQ DX, CX + ADDQ R12, R10 - // | w17 @ 104(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, DI - ADDQ SI, 104(SP) - ADCQ $0x00, DI + // | move to idle register + MOVQ 232(SP), R9 - // | w18 @ 96(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, SI - ADDQ DI, 96(SP) - ADCQ $0x00, SI + // | w14 @ R9 + ADCQ CX, R9 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w19 @ (SP) - ADDQ SI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | Reduce by modulus - MOVQ 16(SP), CX - SUBQ (R14), CX +/* i = 7 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R8 | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 240(SP) | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u7 = w7 * inp MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 8(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 112(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 120(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 128(SP) - MOVQ 104(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 136(SP) - MOVQ 96(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 144(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 152(SP) - SBBQ $0x00, R15 + MULQ inp+32(FP) + MOVQ AX, R13 + MOVQ $0x00, R12 - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 16(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 8(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 112(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 120(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 128(SP), R13 - MOVQ R13, 48(DI) - MOVQ 104(SP), BX - CMOVQCC 136(SP), BX - MOVQ BX, 56(DI) - MOVQ 96(SP), BX - CMOVQCC 144(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 152(SP), BX - MOVQ BX, 72(DI) - RET + // | +/* */ -// func mul11(c *[22]uint64, a *[11]uint64, b *[11]uint64, p *[11]uint64, inp uint64) -TEXT ·mul11(SB), $184-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | save u7 + MOVQ R13, 128(SP) - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - - // | - // | b0 - MOVQ (SI), CX + // | j0 - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | w7 @ R8 + MOVQ (R14), AX + MULQ R13 + ADDQ AX, R8 + ADCQ DX, R12 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 + // | j1 - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - 
MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | w8 @ R15 + MOVQ 8(R14), AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + ADDQ R12, R15 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j2 + + // | w9 @ DI + MOVQ 16(R14), AX + MULQ R13 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 + + // | w10 @ SI + MOVQ 24(R14), AX + MULQ R13 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j4 + + // | w11 @ BX + MOVQ 32(R14), AX + MULQ R13 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX + // | j5 + + // | w12 @ R11 + MOVQ 40(R14), AX + MULQ R13 ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 ADCQ DX, R12 - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | j6 - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | w13 @ R10 + MOVQ 48(R14), AX + MULQ R13 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 + // | j7 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | w14 @ R9 + MOVQ 56(R14), AX + MULQ R13 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R12, R9 - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | move to idle register + MOVQ 240(SP), R8 - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) + // | w15 @ R8 + ADCQ CX, R8 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | - // | b1 - MOVQ 8(SI), CX - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 136(SP) - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 +/* montgomerry reduction q2 */ - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) +/* i = 0 */ - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 
40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX + + MOVQ $0x00, R12 + + // | + +/* */ + + // | j8 + + // | w8 @ R15 + MOVQ 64(R14), AX + MULQ 72(SP) ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + ADCQ DX, R12 - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j9 - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w9 @ DI + MOVQ 72(R14), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j10 - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w10 @ SI + MOVQ 80(R14), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | - // | b2 - MOVQ 16(SI), CX + // | j11 - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | w11 @ BX + MOVQ 88(R14), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX + // | j12 + + // | w12 @ R11 + MOVQ 96(R14), AX + MULQ 72(SP) ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j13 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w13 @ R10 + MOVQ 104(R14), AX + MULQ 72(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j14 - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w14 @ R9 + MOVQ 112(R14), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w15 @ R8 + ADCQ DX, R8 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | bring the carry from q1 + MOVQ 136(SP), CX + ADCQ $0x00, CX - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 
48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 1 */ - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 DI | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 248(SP) | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | - // | b3 - MOVQ 24(SI), CX - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + MOVQ $0x00, R12 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) +/* */ - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j8 - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w9 @ DI + MOVQ 64(R14), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R12 + MOVQ DI, 72(SP) - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j9 - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w10 @ SI + MOVQ 72(R14), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j10 - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w11 @ BX + MOVQ 80(R14), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j11 - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w12 @ R11 + MOVQ 88(R14), AX + MULQ 80(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | - // | b4 - MOVQ 32(SI), CX + // | j12 - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w13 @ R10 + MOVQ 96(R14), AX + MULQ 80(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 
- ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j13 - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w14 @ R9 + MOVQ 104(R14), AX + MULQ 80(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | j14 - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w15 @ R8 + MOVQ 112(R14), AX + MULQ 80(SP) + ADDQ AX, R8 + ADCQ DX, CX + ADDQ R12, R8 - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | move to an idle register + MOVQ 248(SP), R13 - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w16 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* i = 2 */ - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 SI | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 R13 | 17 256(SP) | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) - // | - // | b5 - MOVQ 40(SI), CX + MOVQ $0x00, R12 - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) +/* */ - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | j8 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w10 @ SI + MOVQ 64(R14), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R12 + MOVQ SI, 80(SP) - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j9 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | 
w11 @ BX + MOVQ 72(R14), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j10 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w12 @ R11 + MOVQ 80(R14), AX + MULQ 88(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j11 - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w13 @ R10 + MOVQ 88(R14), AX + MULQ 88(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a10 * b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j12 - // | - // | b6 - MOVQ 48(SI), CX + // | w14 @ R9 + MOVQ 96(R14), AX + MULQ 88(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j13 - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w15 @ R8 + MOVQ 104(R14), AX + MULQ 88(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j14 - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w16 @ R13 + MOVQ 112(R14), AX + MULQ 88(SP) + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R12, R13 - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | move to an idle register + MOVQ 256(SP), DI - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w17 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 3 */ - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 
- | 8 R15 | 9 72(SP) | 10 80(SP) | 11 BX | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 R13 | 17 DI | 18 264(SP) | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + MOVQ $0x00, R12 - // | - // | b7 - MOVQ 56(SI), CX + // | - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* */ - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j8 - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w11 @ BX + MOVQ 64(R14), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R12 + MOVQ BX, 88(SP) - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j9 - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w12 @ R11 + MOVQ 72(R14), AX + MULQ 96(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j10 - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w13 @ R10 + MOVQ 80(R14), AX + MULQ 96(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j11 - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w14 @ R9 + MOVQ 88(R14), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j12 - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w15 @ R8 + MOVQ 96(R14), AX + MULQ 96(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | - // | b8 - MOVQ 64(SI), CX + // | j13 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 
24(SP) + // | w16 @ R13 + MOVQ 104(R14), AX + MULQ 96(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j14 - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w17 @ DI + MOVQ 112(R14), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R12, DI - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | move to an idle register + MOVQ 264(SP), BX - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w18 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* i = 4 */ - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R11 | 13 R10 | 14 R9 + // | 15 R8 | 16 R13 | 17 DI | 18 BX | 19 272(SP) | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + MOVQ $0x00, R12 - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | - // | b9 - MOVQ 72(SI), CX +/* */ - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j8 - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w12 @ R11 + MOVQ 64(R14), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ DX, R12 + MOVQ R11, 96(SP) - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j9 - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w13 @ R10 + MOVQ 72(R14), AX + MULQ 104(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, 
R12 + ADCQ DX, R12 - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j10 - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w14 @ R9 + MOVQ 80(R14), AX + MULQ 104(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j11 - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w15 @ R8 + MOVQ 88(R14), AX + MULQ 104(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j12 - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w16 @ R13 + MOVQ 96(R14), AX + MULQ 104(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a10 * b9 - // | (w19, w20, w21) @ (88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) + // | j13 - // | - // | b10 - MOVQ 80(SI), CX + // | w17 @ DI + MOVQ 104(R14), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j14 - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w18 @ BX + MOVQ 112(R14), AX + MULQ 104(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R12, BX - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | move to an idle register + MOVQ 272(SP), SI - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w19 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* i = 5 */ - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R10 | 14 R9 + 
// | 15 R8 | 16 R13 | 17 DI | 18 BX | 19 SI | 20 280(SP) | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + MOVQ $0x00, R12 - // | a9 * b10 - // | (w19, w20, w21) @ (88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | a10 * b10 - // | (w20, w21) @ (96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) +/* */ - // | - // | Montgomerry Reduction - MOVQ R15, 112(SP) - MOVQ R14, 120(SP) - MOVQ p+24(FP), R14 + // | j8 - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w13 @ R10 + MOVQ 64(R14), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ DX, R12 + MOVQ R10, 104(SP) - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI + // | j9 - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX + // | w14 @ R9 + MOVQ 72(R14), AX + MULQ 112(SP) ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j10 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w15 @ R8 + MOVQ 80(R14), AX + MULQ 112(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | j11 - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX + // | w16 @ R13 + MOVQ 88(R14), AX + MULQ 112(SP) ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w6 @ 120(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI - ADDQ SI, 120(SP) - ADCQ $0x00, DI + // | j12 - // | w7 @ 112(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, SI - ADDQ DI, 112(SP) - ADCQ $0x00, SI + // | w17 @ DI + MOVQ 96(R14), AX + MULQ 112(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | j13 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w18 @ BX + MOVQ 104(R14), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | j14 - // | w11 @ 24(SP) - ADDQ DI, 24(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w19 @ SI + MOVQ 112(R14), AX + MULQ 112(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R12, SI - // | - MOVQ 32(SP), R8 + // | move to an idle register + MOVQ 280(SP), R10 - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w20 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w1 @ 
R9 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + // | - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI +/* i = 6 */ - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R9 + // | 15 R8 | 16 R13 | 17 DI | 18 BX | 19 SI | 20 R10 | 21 64(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + MOVQ $0x00, R12 - // | w6 @ 120(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, SI - ADDQ DI, 120(SP) - ADCQ $0x00, SI + // | - // | w7 @ 112(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, DI - ADDQ SI, 112(SP) - ADCQ $0x00, DI +/* */ - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j8 - // | w9 @ 8(SP) - XORQ DI, DI + // | w14 @ R9 MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ 120(SP) + ADDQ AX, R9 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j9 + + // | w15 @ R8 MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ 120(SP) + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ DI, DI + // | j10 + + // | w16 @ R13 MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + MULQ 120(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - ADDQ DI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j11 - // | - MOVQ 40(SP), R9 + // | w17 @ DI + MOVQ 88(R14), AX + MULQ 120(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j12 - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | w18 @ BX + MOVQ 96(R14), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j13 - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w19 @ SI + MOVQ 104(R14), AX + MULQ 120(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | j14 - // | w6 @ 120(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI - ADDQ SI, 120(SP) - ADCQ $0x00, DI + // | w20 @ R10 + MOVQ 112(R14), AX + MULQ 120(SP) + ADDQ AX, R10 + ADCQ DX, CX + ADDQ R12, R10 - // | w7 @ 112(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, SI - ADDQ DI, 112(SP) - ADCQ $0x00, SI + // | move to an idle register + MOVQ 64(SP), R11 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ 
AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w21 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI +/* i = 7 */ - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R9 + // | 15 R8 | 16 R13 | 17 DI | 18 BX | 19 SI | 20 R10 | 21 R11 | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w12 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI - // | w13 @ R9 - ADDQ DI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + MOVQ $0x00, R12 - // | - MOVQ 48(SP), R10 + // | - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + // | j8 - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w15 @ R8 + MOVQ 64(R14), AX + MULQ 128(SP) + ADDQ AX, R8 + ADCQ DX, R12 - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX + // | j9 + + // | w16 @ R13 + MOVQ 72(R14), AX + MULQ 128(SP) ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w6 @ 120(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, SI - ADDQ DI, 120(SP) - ADCQ $0x00, SI + // | j10 - // | w7 @ 112(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, DI - ADDQ SI, 112(SP) - ADCQ $0x00, DI + // | w17 @ DI + MOVQ 80(R14), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j11 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w18 @ BX + MOVQ 88(R14), AX + MULQ 128(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | j12 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w19 @ SI + MOVQ 96(R14), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | j13 - // | w13 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w20 @ R10 + MOVQ 104(R14), AX + MULQ 128(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R10 - ADDQ DI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j14 - // | - MOVQ 56(SP), R11 + // | w21 @ R11 + MOVQ 112(R14), AX + MULQ 128(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R12, R11 - // | (u @ CX) = (w4 @ 
R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | tolarete this limb to stay in stack + // | w22 @ 56(SP) + ADCQ CX, 56(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI + // | + // | q2 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R9 + // | 15 R8 | 16 R13 | 17 DI | 18 BX | 19 SI | 20 R10 | 21 R11 | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI - // | w6 @ 120(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI - ADDQ SI, 120(SP) - ADCQ $0x00, DI + // | save the carry from q2 + // | should be added to w23 + MOVQ CX, 136(SP) - // | w7 @ 112(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, SI - ADDQ DI, 112(SP) - ADCQ $0x00, SI + // | - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI +/* q2 q3 transition swap */ - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MOVQ 72(SP), CX + MOVQ R11, 72(SP) + MOVQ 80(SP), R11 + MOVQ R10, 80(SP) + MOVQ 88(SP), R10 + MOVQ SI, 88(SP) + MOVQ 96(SP), SI + MOVQ BX, 96(SP) + MOVQ 104(SP), BX + MOVQ DI, 104(SP) - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 CX | 10 R11 | 11 R10 | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 104(SP) | 18 96(SP) | 19 88(SP) | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI - // | w12 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | - // | w13 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI +/* montgomery reduction q3 */ - // | w14 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | - // | w15 @ R11 - ADDQ DI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 8 */ - // | - MOVQ 64(SP), R12 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R15 | 9 CX | 10 R11 | 11 R10 | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 104(SP) | 18 96(SP) | 19 88(SP) | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX + + // | | u8 = w8 * inp + MOVQ R15, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R12 - // | w5 @ R13 - XORQ DI, DI + // | + +/* */ + + // | save u8 + MOVQ DI, 112(SP) + + // | j0 + + // | w8 @ R15 MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + MULQ DI + ADDQ AX, R15 + ADCQ DX, R12 + + // | j1 - // | w6 @ 120(SP) - XORQ SI, SI + // | w9 @ CX MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, SI - ADDQ DI, 120(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w7 @ 112(SP) - XORQ DI, DI + 
// | j2 + + // | w10 @ R11 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, DI - ADDQ SI, 112(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI + // | j3 + + // | w11 @ R10 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | j4 + + // | w12 @ SI MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j5 + + // | w13 @ BX MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ DI, DI + // | j6 + + // | w14 @ R9 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ SI, SI + // | j7 + + // | w15 @ R8 MOVQ 56(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R8 - // | w13 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w16 @ R13 + ADCQ DX, R13 + ADCQ $0x00, R15 - // | w14 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | - // | w15 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI +/* i = 9 */ - // | w16 @ R12 - ADDQ DI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 CX | 10 R11 | 11 R10 | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 104(SP) | 18 96(SP) | 19 88(SP) | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | - MOVQ 72(SP), R13 - // | (u @ CX) = (w6 @ 120(SP)) * inp - MOVQ 120(SP), AX + // | | u9 = w9 * inp + MOVQ CX, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R12 - // | w6 @ 120(SP) - XORQ DI, DI + // | + +/* */ + + // | save u9 + MOVQ DI, 120(SP) + + // | j0 + + // | w9 @ CX MOVQ (R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI + MULQ DI + ADDQ AX, CX + ADCQ DX, R12 + + // | j1 - // | w7 @ 112(SP) - XORQ SI, SI + // | w10 @ R11 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, SI - ADDQ DI, 112(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ DI, DI + // | j2 + + // | w11 @ R10 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 - // | w9 @ 8(SP) - XORQ SI, SI + // | w12 @ SI MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI + // | j4 + + // | w13 @ BX MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 
+ ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ SI, SI + // | j5 + + // | w14 @ R9 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ DI, DI + // | j6 + + // | w15 @ R8 MOVQ 48(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R9 - XORQ SI, SI + // | j7 + + // | w16 @ R13 MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R13 + ADCQ DX, R15 + ADDQ R12, R13 - // | w14 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | move to idle register + MOVQ 104(SP), CX - // | w15 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w17 @ CX + ADCQ R15, CX + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w16 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | - // | w17 @ R13 - ADDQ DI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 10 */ - // | - MOVQ 80(SP), BX - MOVQ BX, 120(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R11 | 11 R10 | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 CX | 18 96(SP) | 19 88(SP) | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | (u @ CX) = (w7 @ 112(SP)) * inp - MOVQ 112(SP), AX + + // | | u10 = w10 * inp + MOVQ R11, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R12 - // | w7 @ 112(SP) - XORQ DI, DI + // | + +/* */ + + // | save u10 + MOVQ DI, 104(SP) + + // | j0 + + // | w10 @ R11 MOVQ (R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, DI + MULQ DI + ADDQ AX, R11 + ADCQ DX, R12 - // | w8 @ (SP) - XORQ SI, SI + // | j1 + + // | w11 @ R10 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | j2 + + // | w12 @ SI MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j3 + + // | w13 @ BX MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ DI, DI + // | j4 + + // | w14 @ R9 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ SI, SI + // | j5 + + // | w15 @ R8 MOVQ 40(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R9 - XORQ DI, DI + // | j6 + + // | w16 @ R13 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R10 - XORQ SI, SI + // | j7 + + // | w17 @ CX MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ 
$0x00, SI + MULQ DI + ADDQ AX, CX + ADCQ DX, R15 + ADDQ R12, CX - // | w15 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | move to idle register + MOVQ 96(SP), R11 - // | w16 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w18 @ R11 + ADCQ R15, R11 + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w17 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | - // | w18 @ 120(SP) - ADDQ DI, R15 - ADCQ R15, 120(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 11 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 R10 | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 88(SP) | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | - MOVQ 88(SP), BX - MOVQ BX, 112(SP) - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX + // | | u11 = w11 * inp + MOVQ R10, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R12 - // | w8 @ (SP) - XORQ DI, DI + // | + +/* */ + + // | save u11 + MOVQ DI, 96(SP) + + // | j0 + + // | w11 @ R10 MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + MULQ DI + ADDQ AX, R10 + ADCQ DX, R12 - // | w9 @ 8(SP) - XORQ SI, SI + // | j1 + + // | w12 @ SI MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ DI, DI + // | j2 + + // | w13 @ BX MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j3 - // | w11 @ 24(SP) - XORQ SI, SI + // | w14 @ R9 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ DI, DI + // | j4 + + // | w15 @ R8 MOVQ 32(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R9 - XORQ SI, SI + // | j5 + + // | w16 @ R13 MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R10 - XORQ DI, DI + // | j6 + + // | w17 @ CX MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w15 @ R11 - XORQ SI, SI + // | j7 + + // | w18 @ R11 MOVQ 56(R14), AX - MULQ CX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R15 + ADDQ R12, R11 - // | w16 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | move to idle register + MOVQ 88(SP), R10 - // | w17 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w19 @ R10 + ADCQ R15, R10 + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w18 @ 120(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI - ADDQ SI, 120(SP) - ADCQ $0x00, DI + // | - // | w19 @ 112(SP) - ADDQ DI, R15 - ADCQ R15, 112(SP) - MOVQ 
$0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 12 */ - // | - MOVQ 96(SP), BX - MOVQ BX, (SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 SI | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 80(SP) | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX + + // | | u12 = w12 * inp + MOVQ SI, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R12 - // | w9 @ 8(SP) - XORQ DI, DI + // | + +/* */ + + // | save u12 + MOVQ DI, 88(SP) + + // | j0 + + // | w12 @ SI MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + MULQ DI + ADDQ AX, SI + ADCQ DX, R12 - // | w10 @ 16(SP) - XORQ SI, SI + // | j1 + + // | w13 @ BX MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ DI, DI + // | j2 + + // | w14 @ R9 MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ SI, SI + // | j3 + + // | w15 @ R8 MOVQ 24(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R9 - XORQ DI, DI + // | j4 + + // | w16 @ R13 MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R10 - XORQ SI, SI + // | j5 + + // | w17 @ CX MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w15 @ R11 - XORQ DI, DI + // | j6 + + // | w18 @ R11 MOVQ 48(R14), AX - MULQ CX + MULQ DI ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w16 @ R12 - XORQ SI, SI + // | j7 + + // | w19 @ R10 MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R10 + ADCQ DX, R15 + ADDQ R12, R10 - // | w17 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | move to idle register + MOVQ 80(SP), SI - // | w18 @ 120(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, SI - ADDQ DI, 120(SP) - ADCQ $0x00, SI + // | w20 @ SI + ADCQ R15, SI + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w19 @ 112(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, DI - ADDQ SI, 112(SP) - ADCQ $0x00, DI + // | - // | w20 @ (SP) - ADDQ DI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 13 */ - // | - MOVQ 104(SP), BX - MOVQ BX, 8(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 BX | 14 R9 + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 SI | 21 72(SP) | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | | u13 = w13 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R12 + + // | - // | (u @ CX) = (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w10 @ 16(SP) - XORQ DI, DI + // | save u13 + MOVQ DI, 
80(SP) + + // | j0 + + // | w13 @ BX MOVQ (R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + MULQ DI + ADDQ AX, BX + ADCQ DX, R12 - // | w11 @ 24(SP) - XORQ SI, SI + // | j1 + + // | w14 @ R9 MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ R8 - XORQ DI, DI + // | j2 + + // | w15 @ R8 MOVQ 16(R14), AX - MULQ CX + MULQ DI ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w13 @ R9 - XORQ SI, SI + // | j3 + + // | w16 @ R13 MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ R10 - XORQ DI, DI + // | j4 + + // | w17 @ CX MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w15 @ R11 - XORQ SI, SI + // | j5 + + // | w18 @ R11 MOVQ 40(R14), AX - MULQ CX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w16 @ R12 - XORQ DI, DI + // | j6 + + // | w19 @ R10 MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w17 @ R13 - XORQ SI, SI + // | j7 + + // | w20 @ SI MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + MULQ DI + ADDQ AX, SI + ADCQ DX, R15 + ADDQ R12, SI - // | w18 @ 120(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, DI - ADDQ SI, 120(SP) - ADCQ $0x00, DI + // | move to idle register + MOVQ 72(SP), BX - // | w19 @ 112(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, SI - ADDQ DI, 112(SP) - ADCQ $0x00, SI + // | w21 @ BX + ADCQ R15, BX + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w20 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | - // | w21 @ 8(SP) - ADDQ DI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 14 */ - // | Reduce by modulus - MOVQ 24(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 16(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 128(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 136(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 144(SP) - MOVQ 120(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 152(SP) - MOVQ 112(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 160(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 168(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 176(SP) - SBBQ $0x00, R15 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 R9 + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 SI | 21 BX | 22 56(SP) | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 24(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 16(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 128(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 136(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 144(SP), R13 - MOVQ R13, 48(DI) - MOVQ 120(SP), BX - CMOVQCC 
152(SP), BX - MOVQ BX, 56(DI) - MOVQ 112(SP), BX - CMOVQCC 160(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 168(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 176(SP), BX - MOVQ BX, 80(DI) - RET -// func square11(c *[22]uint64, a *fe704, p *fe704) -TEXT ·square11(SB), $104-24 -/* inputs */ - // | - MOVQ a+8(FP), DI - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - XORQ CX, CX - XORQ SI, SI - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - - // | a0 - // | w0 @ R9 - MOVQ (DI), R8 - MOVQ R8, AX - MULQ R8 - MOVQ AX, R9 - MOVQ DX, R10 + // | | u14 = w14 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R12 - // | w1 @ R10 - MOVQ 8(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R12 - ADDQ AX, R10 - ADCQ DX, R11 + // | - // | w2 @ R11 - MOVQ 16(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R13 - ADDQ AX, R11 +/* */ + + // | save u14 + MOVQ DI, 72(SP) + + // | j0 + + // | w14 @ R9 + MOVQ (R14), AX + MULQ DI + ADDQ AX, R9 ADCQ DX, R12 - ADCQ $0x00, R13 - // | w3 @ R12 - MOVQ 24(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R14 - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 + // | j1 - // | w4 @ R13 - MOVQ 32(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R15 - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 + // | w15 @ R8 + MOVQ 8(R14), AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + ADDQ R12, R8 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w5 @ R14 - MOVQ 40(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, CX + // | j2 - // | w6 @ R15 - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, SI - ADDQ AX, R15 - ADCQ DX, CX - ADCQ $0x00, SI + // | w16 @ R13 + MOVQ 16(R14), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R12, R13 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w7 @ CX - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, (SP) + // | j3 + + // | w17 @ CX + MOVQ 24(R14), AX + MULQ DI ADDQ AX, CX - ADCQ DX, SI - ADCQ $0x00, (SP) + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ SI - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 8(SP) - ADDQ AX, SI - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) + // | j4 - // | w9 @ (SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 16(SP) - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) + // | w18 @ R11 + MOVQ 32(R14), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 8(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 24(SP) - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) + // | j5 - // | a1 - // | w2 @ R11 - MOVQ 8(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, R11 + // | w19 @ R10 + MOVQ 40(R14), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 - // | w3 @ R12 - MOVQ 16(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R14 - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 + // | j6 - // | w4 @ R13 - MOVQ 24(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, R15 - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 + // | w20 @ SI 
+ MOVQ 48(R14), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w5 @ R14 - MOVQ 32(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, CX - ADDQ AX, R14 + // | j7 + + // | w21 @ BX + MOVQ 56(R14), AX + MULQ DI + ADDQ AX, BX ADCQ DX, R15 - ADCQ $0x00, CX + ADDQ R12, BX - // | w6 @ R15 - MOVQ 40(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, SI - ADDQ AX, R15 - ADCQ DX, CX - ADCQ $0x00, SI + // | move to idle register + MOVQ 56(SP), R9 + + // | w22 @ R9 + ADCQ R15, R9 + MOVQ $0x00, R15 + ADCQ $0x00, R15 + + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 SI | 21 BX | 22 R9 | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) + + + // | aggregate carries from q2 & q3 + // | should be added to w23 + ADCQ R15, 136(SP) + + // | + +/* montgomerry reduction q4 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 SI | 21 BX | 22 R9 | 23 48(SP) | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w7 @ CX - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, (SP) - ADDQ AX, CX - ADCQ DX, SI - ADCQ $0x00, (SP) - // | w8 @ SI - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 8(SP) - ADDQ AX, SI - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) + MOVQ $0x00, R12 - // | w9 @ (SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 16(SP) - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) + // | - // | w10 @ 8(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 24(SP) - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) +/* */ - // | w11 @ 16(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 32(SP) - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) + // | j8 - // | a2 - // | w4 @ R13 - MOVQ 16(DI), R8 - MOVQ R8, AX - MULQ R8 + // | w16 @ R13 + MOVQ 64(R14), AX + MULQ 112(SP) ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, CX - - // | w5 @ R14 - MOVQ 24(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, CX + ADCQ DX, R12 - // | w6 @ R15 - MOVQ 32(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, SI - ADDQ AX, R15 - ADCQ DX, CX - ADCQ $0x00, SI + // | j9 - // | w7 @ CX - MOVQ 40(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, (SP) + // | w17 @ CX + MOVQ 72(R14), AX + MULQ 112(SP) ADDQ AX, CX - ADCQ DX, SI - ADCQ $0x00, (SP) + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w8 @ SI - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 8(SP) - ADDQ AX, SI - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) + // | j10 - // | w9 @ (SP) - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 16(SP) - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) + // | w18 @ R11 + MOVQ 80(R14), AX + MULQ 112(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w10 @ 8(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 24(SP) - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) + // | j11 - // | w11 @ 16(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 32(SP) - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) + // | w19 @ R10 + 
MOVQ 88(R14), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ 24(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 40(SP) - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) + // | j12 - // | a3 - // | w6 @ R15 - MOVQ 24(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, R15 - ADCQ DX, CX - ADCQ $0x00, SI - ADCQ $0x00, (SP) + // | w20 @ SI + MOVQ 96(R14), AX + MULQ 112(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w7 @ CX - MOVQ 32(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, (SP) - ADDQ AX, CX - ADCQ DX, SI - ADCQ $0x00, (SP) + // | j13 - // | w8 @ SI - MOVQ 40(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 8(SP) - ADDQ AX, SI - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) + // | w21 @ BX + MOVQ 104(R14), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w9 @ (SP) - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 16(SP) - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) + // | j14 - // | w10 @ 8(SP) - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 24(SP) - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) + // | w22 @ R9 + MOVQ 112(R14), AX + MULQ 112(SP) + ADDQ AX, R9 + ADCQ 136(SP), DX + ADDQ R12, R9 + MOVQ 48(SP), DI - // | w11 @ 16(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 32(SP) - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) + // | w23 @ DI + ADCQ DX, DI + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w12 @ 24(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 40(SP) - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | w13 @ 32(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 48(SP) - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 1 */ - // | a4 - // | w8 @ SI - MOVQ 32(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, SI - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 CX | 18 R11 | 19 R10 | 20 SI | 21 BX | 22 R9 | 23 DI | 24 40(SP) | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w9 @ (SP) - MOVQ 40(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 16(SP) - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - // | w10 @ 8(SP) - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 24(SP) - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) + MOVQ $0x00, R12 - // | w11 @ 16(SP) - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 32(SP) - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | w12 @ 24(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 40(SP) - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) +/* */ - // | w13 @ 32(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 48(SP) - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) + // | j8 - // | w14 @ 40(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 56(SP) - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) + // | w17 @ CX + MOVQ 64(R14), AX + MULQ 120(SP) + ADDQ AX, CX + ADCQ DX, R12 + MOVQ CX, 56(SP) - // | a5 - // | w10 @ 8(SP) - MOVQ 40(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j9 - 
// | w11 @ 16(SP) - MOVQ 48(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 32(SP) - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) + // | w18 @ R11 + MOVQ 72(R14), AX + MULQ 120(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w12 @ 24(SP) - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 40(SP) - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) + // | j10 - // | w13 @ 32(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 48(SP) - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) + // | w19 @ R10 + MOVQ 80(R14), AX + MULQ 120(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ 40(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 56(SP) - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) + // | j11 - // | w15 @ 48(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 64(SP) - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) + // | w20 @ SI + MOVQ 88(R14), AX + MULQ 120(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a6 - // | w12 @ 24(SP) - MOVQ 48(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j12 - // | w13 @ 32(SP) - MOVQ 56(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 48(SP) - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) + // | w21 @ BX + MOVQ 96(R14), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w14 @ 40(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 56(SP) - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) + // | j13 - // | w15 @ 48(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 64(SP) - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) + // | w22 @ R9 + MOVQ 104(R14), AX + MULQ 120(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | w16 @ 56(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 72(SP) - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) + // | j14 - // | a7 - // | w14 @ 40(SP) - MOVQ 56(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w23 @ DI + MOVQ 112(R14), AX + MULQ 120(SP) + ADDQ AX, DI + ADCQ DX, R15 + ADDQ R12, DI + MOVQ 40(SP), CX - // | w15 @ 48(SP) - MOVQ 64(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 64(SP) - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) + // | w24 @ CX + ADCQ R15, CX + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | w16 @ 56(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 72(SP) - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | w17 @ 64(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 80(SP) - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) +/* i = 2 */ - // | a8 - // | w16 @ 56(SP) - MOVQ 64(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 R11 | 19 R10 | 20 SI | 21 BX | 22 R9 | 23 DI | 24 CX | 25 32(SP) | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | w17 @ 64(SP) - MOVQ 72(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ 
$0x00, 80(SP) - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - // | w18 @ 72(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 88(SP) - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) + MOVQ $0x00, R12 - // | a9 - // | w18 @ 72(SP) - MOVQ 72(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | w19 @ 80(SP) - MOVQ 80(DI), AX - MULQ R8 - ADDQ AX, AX - ADCQ DX, DX - ADCQ $0x00, 96(SP) - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) +/* */ - // | a10 - // | w20 @ 88(SP) - MOVQ 80(DI), R8 - MOVQ R8, AX - MULQ R8 - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) + // | j8 - // | - MOVQ c+0(FP), DI - MOVQ R9, (DI) - MOVQ R10, 8(DI) - MOVQ R11, 16(DI) - MOVQ R12, 24(DI) - MOVQ R13, 32(DI) - MOVQ R14, 40(DI) - MOVQ R15, 48(DI) - MOVQ CX, 56(DI) - MOVQ SI, 64(DI) - MOVQ (SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - MOVQ BX, 88(DI) - MOVQ 24(SP), BX - MOVQ BX, 96(DI) - MOVQ 32(SP), BX - MOVQ BX, 104(DI) - MOVQ 40(SP), BX - MOVQ BX, 112(DI) - MOVQ 48(SP), BX - MOVQ BX, 120(DI) - MOVQ 56(SP), BX - MOVQ BX, 128(DI) - MOVQ 64(SP), BX - MOVQ BX, 136(DI) - MOVQ 72(SP), BX - MOVQ BX, 144(DI) - MOVQ 80(SP), BX - MOVQ BX, 152(DI) - MOVQ 88(SP), BX - MOVQ BX, 160(DI) - MOVQ 96(SP), BX - MOVQ BX, 168(DI) - RET + // | w18 @ R11 + MOVQ 64(R14), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ DX, R12 + MOVQ R11, 40(SP) -// func mul12(c *[24]uint64, a *[12]uint64, b *[12]uint64, p *[12]uint64, inp uint64) -TEXT ·mul12(SB), $208-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | j9 - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - MOVQ $0x00000000, 112(SP) - MOVQ $0x00000000, 120(SP) - - // | - // | b0 - MOVQ (SI), CX + // | w19 @ R10 + MOVQ 72(R14), AX + MULQ 104(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w20 @ SI + MOVQ 80(R14), AX + MULQ 104(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j11 + + // | w21 @ BX + MOVQ 88(R14), AX + MULQ 104(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | j12 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX + // | w22 @ R9 + MOVQ 96(R14), AX + MULQ 104(SP) ADDQ AX, R9 - ADCQ DX, R10 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | j13 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 + // | w23 @ DI + MOVQ 104(R14), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 ADCQ DX, R12 - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | j14 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), 
AX - MULQ CX - ADDQ AX, R14 + // | w24 @ CX + MOVQ 112(R14), AX + MULQ 104(SP) + ADDQ AX, CX ADCQ DX, R15 + ADDQ R12, CX + MOVQ 32(SP), R11 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | w25 @ R11 + ADCQ R15, R11 + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) +/* i = 3 */ - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 40(SP) | 19 R10 | 20 SI | 21 BX | 22 R9 | 23 DI | 24 CX | 25 R11 | 26 24(SP) | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a11 * b0 - // | (w11, w12) @ (24(SP), 32(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - // | - // | b1 - MOVQ 8(SI), CX + MOVQ $0x00, R12 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 +/* */ - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 + // | j8 + + // | w19 @ R10 + MOVQ 64(R14), AX + MULQ 96(SP) + ADDQ AX, R10 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + MOVQ R10, 32(SP) - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j9 - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w20 @ SI + MOVQ 72(R14), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j10 - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w21 @ BX + MOVQ 80(R14), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j11 - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w22 @ R9 + MOVQ 88(R14), AX + MULQ 96(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j12 - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w23 @ DI + MOVQ 96(R14), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ $0x00, DX 
+ ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a11 * b1 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j13 - // | - // | b2 - MOVQ 16(SI), CX + // | w24 @ CX + MOVQ 104(R14), AX + MULQ 96(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | j14 - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX + // | w25 @ R11 + MOVQ 112(R14), AX + MULQ 96(SP) ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ DX, R15 + ADDQ R12, R11 + MOVQ 24(SP), R10 - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 + // | w26 @ R10 + ADCQ R15, R10 + MOVQ $0x00, R15 ADCQ $0x00, R15 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) +/* i = 4 */ - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 40(SP) | 19 32(SP) | 20 SI | 21 BX | 22 R9 | 23 DI | 24 CX | 25 R11 | 26 R10 | 27 16(SP) | 28 8(SP) | 29 (SP) - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + MOVQ $0x00, R12 - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* */ - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j8 - // | a11 * b2 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w20 @ SI + MOVQ 64(R14), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R12 + MOVQ SI, 24(SP) - // | - // | b3 - MOVQ 24(SI), CX + // | j9 - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX + // | w21 @ BX + MOVQ 72(R14), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R12, BX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j10 + + // | w22 @ R9 + MOVQ 80(R14), AX + MULQ 88(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j11 + + // | w23 @ DI + MOVQ 88(R14), AX + MULQ 88(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + 
MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j12 + + // | w24 @ CX + MOVQ 96(R14), AX + MULQ 88(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j13 + + // | w25 @ R11 + MOVQ 104(R14), AX + MULQ 88(SP) ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j14 - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | w26 @ R10 + MOVQ 112(R14), AX + MULQ 88(SP) + ADDQ AX, R10 + ADCQ DX, R15 + ADDQ R12, R10 + MOVQ 16(SP), SI + + // | w27 @ SI + ADCQ R15, SI + MOVQ $0x00, R15 ADCQ $0x00, R15 - ADCQ $0x00, (SP) - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* i = 5 */ - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 BX | 22 R9 | 23 DI | 24 CX | 25 R11 | 26 R10 | 27 SI | 28 8(SP) | 29 (SP) - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + MOVQ $0x00, R12 - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* */ - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j8 - // | a11 * b3 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w21 @ BX + MOVQ 64(R14), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ DX, R12 + MOVQ BX, 16(SP) - // | - // | b4 - MOVQ 32(SI), CX + // | j9 - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w22 @ R9 + MOVQ 72(R14), AX + MULQ 80(SP) + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R12, R9 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | j10 + + // | w23 @ DI + MOVQ 80(R14), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j11 + + // | w24 @ CX + MOVQ 88(R14), AX + MULQ 80(SP) + ADDQ AX, CX + ADCQ $0x00, 
DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j12 + + // | w25 @ R11 + MOVQ 96(R14), AX + MULQ 80(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j13 + + // | w26 @ R10 + MOVQ 104(R14), AX + MULQ 80(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 + + // | j14 + + // | w27 @ SI + MOVQ 112(R14), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, R15 + ADDQ R12, SI + MOVQ 8(SP), BX + + // | w28 @ BX + ADCQ R15, BX + MOVQ $0x00, R15 ADCQ $0x00, R15 - ADCQ $0x00, (SP) - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* i = 6 */ - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 R9 | 23 DI | 24 CX | 25 R11 | 26 R10 | 27 SI | 28 BX | 29 (SP) - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + MOVQ $0x00, R12 - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* */ - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j8 - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w22 @ R9 + MOVQ 64(R14), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ DX, R12 - // | a11 * b4 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j9 - // | - // | b5 - MOVQ 40(SI), CX + // | w23 @ DI + MOVQ 72(R14), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R12, DI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j10 - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w24 @ CX + MOVQ 80(R14), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R12, CX + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) 
+ // | j11 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w25 @ R11 + MOVQ 88(R14), AX + MULQ 72(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R12, R11 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j12 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w26 @ R10 + MOVQ 96(R14), AX + MULQ 72(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R12, R10 + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j13 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w27 @ SI + MOVQ 104(R14), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R12, SI + MOVQ $0x00, R12 + ADCQ DX, R12 - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j14 - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w28 @ BX + MOVQ 112(R14), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ DX, R15 + ADDQ R12, BX - // | a10 * b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | very last limb goes to short carry register + MOVQ (SP), R12 - // | a11 * b5 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w-1 @ R12 + ADCQ R15, R12 + MOVQ $0x00, R15 + ADCQ $0x00, R15 - // | - // | b6 - MOVQ 48(SI), CX + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - + // | 15 R8 | 16 R13 | 17 56(SP) | 18 40(SP) | 19 32(SP) | 20 24(SP) | 21 16(SP) | 22 R9 | 23 DI | 24 CX | 25 R11 | 26 R10 | 27 SI | 28 BX | 29 R12 - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) +/* modular reduction */ - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + MOVQ R8, DX + SUBQ (R14), DX + MOVQ DX, (SP) + MOVQ R13, DX + SBBQ 8(R14), DX + MOVQ DX, 8(SP) + MOVQ 56(SP), DX + SBBQ 16(R14), DX + MOVQ DX, 128(SP) + MOVQ 40(SP), DX + SBBQ 24(R14), DX + MOVQ DX, 144(SP) + MOVQ 
32(SP), DX + SBBQ 32(R14), DX + MOVQ DX, 152(SP) + MOVQ 24(SP), DX + SBBQ 40(R14), DX + MOVQ DX, 160(SP) + MOVQ 16(SP), DX + SBBQ 48(R14), DX + MOVQ DX, 168(SP) + MOVQ R9, DX + SBBQ 56(R14), DX + MOVQ DX, 176(SP) + MOVQ DI, DX + SBBQ 64(R14), DX + MOVQ DX, 184(SP) + MOVQ CX, DX + SBBQ 72(R14), DX + MOVQ DX, 192(SP) + MOVQ R11, DX + SBBQ 80(R14), DX + MOVQ DX, 200(SP) + MOVQ R10, DX + SBBQ 88(R14), DX + MOVQ DX, 208(SP) + MOVQ SI, DX + SBBQ 96(R14), DX + MOVQ DX, 216(SP) + MOVQ BX, DX + SBBQ 104(R14), DX + MOVQ DX, 224(SP) + MOVQ R12, DX + SBBQ 112(R14), DX + MOVQ DX, 232(SP) + SBBQ $0x00, R15 - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* out */ - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + MOVQ c+0(FP), R15 + CMOVQCC (SP), R8 + MOVQ R8, (R15) + CMOVQCC 8(SP), R13 + MOVQ R13, 8(R15) + MOVQ 56(SP), DX + CMOVQCC 128(SP), DX + MOVQ DX, 16(R15) + MOVQ 40(SP), DX + CMOVQCC 144(SP), DX + MOVQ DX, 24(R15) + MOVQ 32(SP), DX + CMOVQCC 152(SP), DX + MOVQ DX, 32(R15) + MOVQ 24(SP), DX + CMOVQCC 160(SP), DX + MOVQ DX, 40(R15) + MOVQ 16(SP), DX + CMOVQCC 168(SP), DX + MOVQ DX, 48(R15) + CMOVQCC 176(SP), R9 + MOVQ R9, 56(R15) + CMOVQCC 184(SP), DI + MOVQ DI, 64(R15) + CMOVQCC 192(SP), CX + MOVQ CX, 72(R15) + CMOVQCC 200(SP), R11 + MOVQ R11, 80(R15) + CMOVQCC 208(SP), R10 + MOVQ R10, 88(R15) + CMOVQCC 216(SP), SI + MOVQ SI, 96(R15) + CMOVQCC 224(SP), BX + MOVQ BX, 104(R15) + CMOVQCC 232(SP), R12 + MOVQ R12, 112(R15) + RET - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* end */ + + +// func cpy16(dst *[16]uint64, src *[16]uint64) +TEXT ·cpy16(SB), NOSPLIT, $0-16 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ (SI), R8 + MOVQ R8, (DI) + MOVQ 8(SI), R8 + MOVQ R8, 8(DI) + MOVQ 16(SI), R8 + MOVQ R8, 16(DI) + MOVQ 24(SI), R8 + MOVQ R8, 24(DI) + MOVQ 32(SI), R8 + MOVQ R8, 32(DI) + MOVQ 40(SI), R8 + MOVQ R8, 40(DI) + MOVQ 48(SI), R8 + MOVQ R8, 48(DI) + MOVQ 56(SI), R8 + MOVQ R8, 56(DI) + MOVQ 64(SI), R8 + MOVQ R8, 64(DI) + MOVQ 72(SI), R8 + MOVQ R8, 72(DI) + MOVQ 80(SI), R8 + MOVQ R8, 80(DI) + MOVQ 88(SI), R8 + MOVQ R8, 88(DI) + MOVQ 96(SI), R8 + MOVQ R8, 96(DI) + MOVQ 104(SI), R8 + MOVQ R8, 104(DI) + MOVQ 112(SI), R8 + MOVQ R8, 112(DI) + MOVQ 120(SI), R8 + MOVQ R8, 120(DI) + RET + +// func eq16(a *[16]uint64, b *[16]uint64) bool +TEXT ·eq16(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVB $0x00, ret+16(FP) + MOVQ (DI), R8 + CMPQ (SI), R8 + JNE ret + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JNE ret + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JNE ret + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JNE ret + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JNE ret + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JNE ret + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JNE ret + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JNE ret + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JNE ret + 
MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JNE ret + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JNE ret + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JNE ret + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JNE ret + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JNE ret + MOVQ 112(DI), R8 + CMPQ 112(SI), R8 + JNE ret + MOVQ 120(DI), R8 + CMPQ 120(SI), R8 + JNE ret + MOVB $0x01, ret+16(FP) + +ret: + RET + +// func cmp16(a *[16]uint64, b *[16]uint64) int8 +TEXT ·cmp16(SB), NOSPLIT, $0-17 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ 120(DI), R8 + CMPQ 120(SI), R8 + JB gt + JA lt + MOVQ 112(DI), R8 + CMPQ 112(SI), R8 + JB gt + JA lt + MOVQ 104(DI), R8 + CMPQ 104(SI), R8 + JB gt + JA lt + MOVQ 96(DI), R8 + CMPQ 96(SI), R8 + JB gt + JA lt + MOVQ 88(DI), R8 + CMPQ 88(SI), R8 + JB gt + JA lt + MOVQ 80(DI), R8 + CMPQ 80(SI), R8 + JB gt + JA lt + MOVQ 72(DI), R8 + CMPQ 72(SI), R8 + JB gt + JA lt + MOVQ 64(DI), R8 + CMPQ 64(SI), R8 + JB gt + JA lt + MOVQ 56(DI), R8 + CMPQ 56(SI), R8 + JB gt + JA lt + MOVQ 48(DI), R8 + CMPQ 48(SI), R8 + JB gt + JA lt + MOVQ 40(DI), R8 + CMPQ 40(SI), R8 + JB gt + JA lt + MOVQ 32(DI), R8 + CMPQ 32(SI), R8 + JB gt + JA lt + MOVQ 24(DI), R8 + CMPQ 24(SI), R8 + JB gt + JA lt + MOVQ 16(DI), R8 + CMPQ 16(SI), R8 + JB gt + JA lt + MOVQ 8(DI), R8 + CMPQ 8(SI), R8 + JB gt + JA lt + MOVQ (DI), R8 + CMPQ (SI), R8 + JB gt + JA lt + MOVB $0x00, ret+16(FP) + JMP ret - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +gt: + MOVB $0x01, ret+16(FP) + JMP ret - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +lt: + MOVB $0xff, ret+16(FP) - // | a11 * b6 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +ret: + RET +// func add16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64) +TEXT ·add16(SB), NOSPLIT, $176-32 // | - // | b7 - MOVQ 56(SI), CX + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ 112(SI), BX + MOVQ BX, 32(SP) + MOVQ 120(DI), BX + ADCQ 120(SI), BX + MOVQ BX, 40(SP) + ADCQ $0x00, AX - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | + MOVQ p+24(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 48(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 56(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 64(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 72(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 80(SP) + MOVQ R11, BX + SBBQ 40(SI), BX 
+ MOVQ BX, 88(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 96(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 104(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 112(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 120(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 128(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 136(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 144(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 152(SP) + MOVQ 32(SP), BX + SBBQ 112(SI), BX + MOVQ BX, 160(SP) + MOVQ 40(SP), BX + SBBQ 120(SI), BX + MOVQ BX, 168(SP) + SBBQ $0x00, AX - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | + MOVQ c+0(FP), DI + CMOVQCC 48(SP), CX + MOVQ CX, (DI) + CMOVQCC 56(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 64(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 72(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 80(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 88(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 96(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 104(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 112(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 120(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 128(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 136(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 144(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + CMOVQCC 152(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + CMOVQCC 160(SP), BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + CMOVQCC 168(SP), BX + MOVQ BX, 120(DI) + RET - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* end */ - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + RET - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +// func addn16(a *[16]uint64, b *[16]uint64) uint64 +TEXT ·addn16(SB), NOSPLIT, $48-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + MOVQ (DI), CX + ADDQ (SI), CX + MOVQ 8(DI), DX + ADCQ 8(SI), DX + MOVQ 16(DI), R8 + ADCQ 16(SI), R8 + MOVQ 24(DI), R9 + ADCQ 24(SI), R9 + MOVQ 32(DI), R10 + ADCQ 32(SI), R10 + MOVQ 40(DI), R11 + ADCQ 40(SI), R11 + MOVQ 48(DI), R12 + ADCQ 48(SI), R12 + MOVQ 56(DI), R13 + ADCQ 56(SI), R13 + MOVQ 64(DI), R14 + ADCQ 64(SI), R14 + MOVQ 72(DI), R15 + ADCQ 72(SI), R15 + MOVQ 80(DI), BX + ADCQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ 112(SI), BX + MOVQ BX, 32(SP) + MOVQ 120(DI), BX + ADCQ 120(SI), BX + MOVQ BX, 40(SP) + ADCQ $0x00, AX - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + 
MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + MOVQ BX, 120(DI) + MOVQ AX, ret+16(FP) + RET - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* end */ - // | a11 * b7 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + RET +// func double16(c *[16]uint64, a *[16]uint64, p *[16]uint64) +TEXT ·double16(SB), NOSPLIT, $176-24 // | - // | b8 - MOVQ 64(SI), CX + MOVQ a+8(FP), DI + XORQ AX, AX + MOVQ (DI), CX + ADDQ CX, CX + MOVQ 8(DI), DX + ADCQ DX, DX + MOVQ 16(DI), R8 + ADCQ R8, R8 + MOVQ 24(DI), R9 + ADCQ R9, R9 + MOVQ 32(DI), R10 + ADCQ R10, R10 + MOVQ 40(DI), R11 + ADCQ R11, R11 + MOVQ 48(DI), R12 + ADCQ R12, R12 + MOVQ 56(DI), R13 + ADCQ R13, R13 + MOVQ 64(DI), R14 + ADCQ R14, R14 + MOVQ 72(DI), R15 + ADCQ R15, R15 + MOVQ 80(DI), BX + ADCQ BX, BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + ADCQ BX, BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + ADCQ BX, BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + ADCQ BX, BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + ADCQ BX, BX + MOVQ BX, 32(SP) + MOVQ 120(DI), BX + ADCQ BX, BX + MOVQ BX, 40(SP) + ADCQ $0x00, AX - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | + MOVQ p+16(FP), SI + MOVQ CX, BX + SUBQ (SI), BX + MOVQ BX, 48(SP) + MOVQ DX, BX + SBBQ 8(SI), BX + MOVQ BX, 56(SP) + MOVQ R8, BX + SBBQ 16(SI), BX + MOVQ BX, 64(SP) + MOVQ R9, BX + SBBQ 24(SI), BX + MOVQ BX, 72(SP) + MOVQ R10, BX + SBBQ 32(SI), BX + MOVQ BX, 80(SP) + MOVQ R11, BX + SBBQ 40(SI), BX + MOVQ BX, 88(SP) + MOVQ R12, BX + SBBQ 48(SI), BX + MOVQ BX, 96(SP) + MOVQ R13, BX + SBBQ 56(SI), BX + MOVQ BX, 104(SP) + MOVQ R14, BX + SBBQ 64(SI), BX + MOVQ BX, 112(SP) + MOVQ R15, BX + SBBQ 72(SI), BX + MOVQ BX, 120(SP) + MOVQ (SP), BX + SBBQ 80(SI), BX + MOVQ BX, 128(SP) + MOVQ 8(SP), BX + SBBQ 88(SI), BX + MOVQ BX, 136(SP) + MOVQ 16(SP), BX + SBBQ 96(SI), BX + MOVQ BX, 144(SP) + MOVQ 24(SP), BX + SBBQ 104(SI), BX + MOVQ BX, 152(SP) + MOVQ 32(SP), BX + SBBQ 112(SI), BX + MOVQ BX, 160(SP) + MOVQ 40(SP), BX + SBBQ 120(SI), BX + MOVQ BX, 168(SP) + SBBQ $0x00, AX - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | + MOVQ c+0(FP), DI + CMOVQCC 48(SP), CX + MOVQ CX, (DI) + CMOVQCC 56(SP), DX + MOVQ DX, 8(DI) + CMOVQCC 64(SP), R8 + MOVQ R8, 16(DI) + CMOVQCC 72(SP), R9 + MOVQ R9, 24(DI) + CMOVQCC 80(SP), R10 + MOVQ R10, 32(DI) + CMOVQCC 88(SP), R11 + MOVQ R11, 40(DI) + CMOVQCC 96(SP), R12 + MOVQ R12, 48(DI) + CMOVQCC 104(SP), R13 + MOVQ R13, 56(DI) + CMOVQCC 112(SP), R14 + MOVQ R14, 64(DI) + CMOVQCC 120(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + CMOVQCC 128(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + CMOVQCC 136(SP), BX + MOVQ 
BX, 88(DI) + MOVQ 16(SP), BX + CMOVQCC 144(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + CMOVQCC 152(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + CMOVQCC 160(SP), BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + CMOVQCC 168(SP), BX + MOVQ BX, 120(DI) + RET - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* end */ - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + RET - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +// func sub16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64) +TEXT ·sub16(SB), NOSPLIT, $176-32 + // | + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + SBBQ 112(SI), BX + MOVQ BX, 32(SP) + MOVQ 120(DI), BX + SBBQ 120(SI), BX + MOVQ BX, 40(SP) - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + MOVQ p+24(FP), SI + CMOVQCS (SI), AX + MOVQ AX, 48(SP) + CMOVQCS 8(SI), AX + MOVQ AX, 56(SP) + CMOVQCS 16(SI), AX + MOVQ AX, 64(SP) + CMOVQCS 24(SI), AX + MOVQ AX, 72(SP) + CMOVQCS 32(SI), AX + MOVQ AX, 80(SP) + CMOVQCS 40(SI), AX + MOVQ AX, 88(SP) + CMOVQCS 48(SI), AX + MOVQ AX, 96(SP) + CMOVQCS 56(SI), AX + MOVQ AX, 104(SP) + CMOVQCS 64(SI), AX + MOVQ AX, 112(SP) + CMOVQCS 72(SI), AX + MOVQ AX, 120(SP) + CMOVQCS 80(SI), AX + MOVQ AX, 128(SP) + CMOVQCS 88(SI), AX + MOVQ AX, 136(SP) + CMOVQCS 96(SI), AX + MOVQ AX, 144(SP) + CMOVQCS 104(SI), AX + MOVQ AX, 152(SP) + CMOVQCS 112(SI), AX + MOVQ AX, 160(SP) + CMOVQCS 120(SI), AX + MOVQ AX, 168(SP) - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + MOVQ c+0(FP), DI + ADDQ 48(SP), CX + MOVQ CX, (DI) + ADCQ 56(SP), DX + MOVQ DX, 8(DI) + ADCQ 64(SP), R8 + MOVQ R8, 16(DI) + ADCQ 72(SP), R9 + MOVQ R9, 24(DI) + ADCQ 80(SP), R10 + MOVQ R10, 32(DI) + ADCQ 88(SP), R11 + MOVQ R11, 40(DI) + ADCQ 96(SP), R12 + MOVQ R12, 48(DI) + ADCQ 104(SP), R13 + MOVQ R13, 56(DI) + ADCQ 112(SP), R14 + MOVQ R14, 64(DI) + ADCQ 120(SP), R15 + MOVQ R15, 72(DI) + MOVQ (SP), BX + ADCQ 128(SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + ADCQ 136(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + ADCQ 144(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + ADCQ 152(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + ADCQ 160(SP), 
BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + ADCQ 168(SP), BX + MOVQ BX, 120(DI) + RET - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* end */ - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + RET - // | a11 * b8 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +// func subn16(a *[16]uint64, b *[16]uint64) uint64 +TEXT ·subn16(SB), NOSPLIT, $48-24 + // | + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + XORQ AX, AX // | - // | b9 - MOVQ 72(SI), CX + MOVQ (DI), CX + SUBQ (SI), CX + MOVQ 8(DI), DX + SBBQ 8(SI), DX + MOVQ 16(DI), R8 + SBBQ 16(SI), R8 + MOVQ 24(DI), R9 + SBBQ 24(SI), R9 + MOVQ 32(DI), R10 + SBBQ 32(SI), R10 + MOVQ 40(DI), R11 + SBBQ 40(SI), R11 + MOVQ 48(DI), R12 + SBBQ 48(SI), R12 + MOVQ 56(DI), R13 + SBBQ 56(SI), R13 + MOVQ 64(DI), R14 + SBBQ 64(SI), R14 + MOVQ 72(DI), R15 + SBBQ 72(SI), R15 + MOVQ 80(DI), BX + SBBQ 80(SI), BX + MOVQ BX, (SP) + MOVQ 88(DI), BX + SBBQ 88(SI), BX + MOVQ BX, 8(SP) + MOVQ 96(DI), BX + SBBQ 96(SI), BX + MOVQ BX, 16(SP) + MOVQ 104(DI), BX + SBBQ 104(SI), BX + MOVQ BX, 24(SP) + MOVQ 112(DI), BX + SBBQ 112(SI), BX + MOVQ BX, 32(SP) + MOVQ 120(DI), BX + SBBQ 120(SI), BX + MOVQ BX, 40(SP) + ADCQ $0x00, AX - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + MOVQ BX, 120(DI) + MOVQ AX, ret+16(FP) + RET - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* end */ - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + RET - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +// func _neg16(c *[16]uint64, a *[16]uint64, p *[16]uint64) +TEXT ·_neg16(SB), NOSPLIT, $48-24 + // | + MOVQ a+8(FP), DI - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + MOVQ p+16(FP), SI + MOVQ (SI), CX + SUBQ (DI), CX + MOVQ 8(SI), DX + SBBQ 8(DI), DX + MOVQ 16(SI), R8 + SBBQ 16(DI), R8 + 
MOVQ 24(SI), R9 + SBBQ 24(DI), R9 + MOVQ 32(SI), R10 + SBBQ 32(DI), R10 + MOVQ 40(SI), R11 + SBBQ 40(DI), R11 + MOVQ 48(SI), R12 + SBBQ 48(DI), R12 + MOVQ 56(SI), R13 + SBBQ 56(DI), R13 + MOVQ 64(SI), R14 + SBBQ 64(DI), R14 + MOVQ 72(SI), R15 + SBBQ 72(DI), R15 + MOVQ 80(SI), BX + SBBQ 80(DI), BX + MOVQ BX, (SP) + MOVQ 88(SI), BX + SBBQ 88(DI), BX + MOVQ BX, 8(SP) + MOVQ 96(SI), BX + SBBQ 96(DI), BX + MOVQ BX, 16(SP) + MOVQ 104(SI), BX + SBBQ 104(DI), BX + MOVQ BX, 24(SP) + MOVQ 112(SI), BX + SBBQ 112(DI), BX + MOVQ BX, 32(SP) + MOVQ 120(SI), BX + SBBQ 120(DI), BX + MOVQ BX, 40(SP) - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + MOVQ c+0(FP), DI + MOVQ CX, (DI) + MOVQ DX, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) + MOVQ R10, 32(DI) + MOVQ R11, 40(DI) + MOVQ R12, 48(DI) + MOVQ R13, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) + MOVQ (SP), BX + MOVQ BX, 80(DI) + MOVQ 8(SP), BX + MOVQ BX, 88(DI) + MOVQ 16(SP), BX + MOVQ BX, 96(DI) + MOVQ 24(SP), BX + MOVQ BX, 104(DI) + MOVQ 32(SP), BX + MOVQ BX, 112(DI) + MOVQ 40(SP), BX + MOVQ BX, 120(DI) + RET - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* end */ - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + RET - // | a10 * b9 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +// func mul_two_16(a *[16]uint64) +TEXT ·mul_two_16(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCLQ $0x01, (DI) + RCLQ $0x01, 8(DI) + RCLQ $0x01, 16(DI) + RCLQ $0x01, 24(DI) + RCLQ $0x01, 32(DI) + RCLQ $0x01, 40(DI) + RCLQ $0x01, 48(DI) + RCLQ $0x01, 56(DI) + RCLQ $0x01, 64(DI) + RCLQ $0x01, 72(DI) + RCLQ $0x01, 80(DI) + RCLQ $0x01, 88(DI) + RCLQ $0x01, 96(DI) + RCLQ $0x01, 104(DI) + RCLQ $0x01, 112(DI) + RCLQ $0x01, 120(DI) + RET - // | a11 * b9 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) +// func div_two_16(a *[16]uint64) +TEXT ·div_two_16(SB), NOSPLIT, $0-8 + MOVQ a+0(FP), DI + XORQ AX, AX + RCRQ $0x01, 120(DI) + RCRQ $0x01, 112(DI) + RCRQ $0x01, 104(DI) + RCRQ $0x01, 96(DI) + RCRQ $0x01, 88(DI) + RCRQ $0x01, 80(DI) + RCRQ $0x01, 72(DI) + RCRQ $0x01, 64(DI) + RCRQ $0x01, 56(DI) + RCRQ $0x01, 48(DI) + RCRQ $0x01, 40(DI) + RCRQ $0x01, 32(DI) + RCRQ $0x01, 24(DI) + RCRQ $0x01, 16(DI) + RCRQ $0x01, 8(DI) + RCRQ $0x01, (DI) + RET - // | - // | b10 - MOVQ 80(SI), CX +// func mul16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64, inp uint64) +TEXT ·mul16(SB), NOSPLIT, $304-40 + // | - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* inputs */ - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 
32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + XORQ AX, AX - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 0 */ - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a0 @ DX + MOVQ (DI), DX - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a0 * b0 + MULXQ (SI), AX, CX + MOVQ AX, (SP) - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a0 * b1 + MULXQ 8(SI), AX, R8 + ADCXQ AX, CX - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a0 * b2 + MULXQ 16(SI), AX, R9 + ADCXQ AX, R8 - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | a0 * b3 + MULXQ 24(SI), AX, R10 + ADCXQ AX, R9 - // | a9 * b10 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | a0 * b4 + MULXQ 32(SI), AX, R11 + ADCXQ AX, R10 - // | a10 * b10 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | a0 * b5 + MULXQ 40(SI), AX, R12 + ADCXQ AX, R11 - // | a11 * b10 - // | (w21, w22, w23) @ (104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) + // | a0 * b6 + MULXQ 48(SI), AX, R13 + ADCXQ AX, R12 - // | - // | b11 - MOVQ 88(SI), CX + // | a0 * b7 + MULXQ 56(SI), AX, R14 + ADCXQ AX, R13 - // | a0 * b11 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a0 * b8 + MULXQ 64(SI), AX, R15 + ADCXQ AX, R14 + ADCQ $0x00, R15 - // | a1 * b11 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a2 * b11 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 1 */ - // | a3 * b11 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a1 @ DX + MOVQ 8(DI), DX + XORQ AX, AX - // | a4 * b11 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a1 * b0 + MULXQ (SI), AX, BX + 
ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 8(SP) + MOVQ $0x00, CX - // | a5 * b11 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a1 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a6 * b11 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a1 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a7 * b11 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | a1 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a8 * b11 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | a1 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a9 * b11 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | a1 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a10 * b11 - // | (w21, w22, w23) @ (104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) + // | a1 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a11 * b11 - // | (w22, w23) @ (112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) + // | a1 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | - // | Montgomerry Reduction - MOVQ R15, 128(SP) - MOVQ R14, 136(SP) - MOVQ p+24(FP), R14 + // | a1 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI +/* i = 2 */ - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | a2 @ DX + MOVQ 16(DI), DX + XORQ AX, AX - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | a2 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 16(SP) + MOVQ $0x00, R8 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | a2 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | a2 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | a2 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w6 @ 136(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI - ADDQ SI, 136(SP) - ADCQ $0x00, DI + // | a2 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w7 @ 128(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI + // | a2 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - 
ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | a2 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a2 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | a2 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | - // | w12 @ 32(SP) - ADDQ SI, 32(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 3 */ - // | - MOVQ 40(SP), R8 + // | a3 @ DX + MOVQ 24(DI), DX + XORQ AX, AX - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a3 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 24(SP) + MOVQ $0x00, R9 - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + // | a3 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | a3 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | a3 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | a3 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | a3 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w6 @ 136(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, SI - ADDQ DI, 136(SP) - ADCQ $0x00, SI + // | a3 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w7 @ 128(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, DI - ADDQ SI, 128(SP) - ADCQ $0x00, DI + // | a3 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a3 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI +/* i = 4 */ - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | a4 @ DX + MOVQ 32(DI), DX + XORQ AX, AX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | a4 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 32(SP) + MOVQ $0x00, R10 - // | w13 @ R8 - ADDQ SI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a4 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | - MOVQ 48(SP), R9 + // | a4 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ 
inp+32(FP) - MOVQ AX, CX + // | a4 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | a4 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | a4 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | a4 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | a4 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w6 @ 136(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI - ADDQ SI, 136(SP) - ADCQ $0x00, DI + // | a4 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 - // | w7 @ 128(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI + // | - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI +/* i = 5 */ - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a5 @ DX + MOVQ 40(DI), DX + XORQ AX, AX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | a5 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 40(SP) + MOVQ $0x00, R11 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | a5 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | a5 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w13 @ R8 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | a5 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w14 @ R9 - ADDQ SI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a5 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | - MOVQ 56(SP), R10 + // | a5 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a5 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + // | a5 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | a5 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | - // | w6 @ 136(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, SI - ADDQ DI, 136(SP) - ADCQ $0x00, SI +/* i = 6 */ - // | w7 @ 128(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, DI - ADDQ SI, 128(SP) - ADCQ $0x00, DI + // | a6 @ DX + MOVQ 48(DI), DX + XORQ AX, AX - // | w8 @ (SP) - XORQ SI, SI 
- MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a6 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 48(SP) + MOVQ $0x00, R12 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | a6 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | a6 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | a6 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | a6 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w13 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | a6 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w14 @ R9 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | a6 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w15 @ R10 - ADDQ SI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a6 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | - MOVQ 64(SP), R11 + // | a6 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI +/* i = 7 */ - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | a7 @ DX + MOVQ 56(DI), DX + XORQ AX, AX - // | w6 @ 136(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI - ADDQ SI, 136(SP) - ADCQ $0x00, DI + // | a7 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 56(SP) + MOVQ $0x00, R13 - // | w7 @ 128(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI + // | a7 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | a7 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a7 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | a7 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | a7 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | a7 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w13 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | a7 * b7 + 
MULXQ 56(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w14 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | a7 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 - // | w15 @ R10 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | - // | w16 @ R11 - ADDQ SI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 8 */ - // | - MOVQ 72(SP), R12 + // | a8 @ DX + MOVQ 64(DI), DX + XORQ AX, AX - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a8 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 64(SP) + MOVQ $0x00, R14 - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + // | a8 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w6 @ 136(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, SI - ADDQ DI, 136(SP) - ADCQ $0x00, SI + // | a8 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w7 @ 128(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, DI - ADDQ SI, 128(SP) - ADCQ $0x00, DI + // | a8 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a8 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | a8 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | a8 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | a8 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | a8 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 - // | w13 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | - // | w14 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI +/* i = 9 */ - // | w15 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | a9 @ DX + MOVQ 72(DI), DX + XORQ AX, AX - // | w16 @ R11 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | a9 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 72(SP) + MOVQ $0x00, R15 - // | w17 @ R12 - ADDQ SI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a9 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | - MOVQ 80(SP), R13 + // | a9 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | (u @ CX) = (w6 @ 136(SP)) * inp - MOVQ 136(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a9 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w6 @ 136(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI + // | a9 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | 
w7 @ 128(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI + // | a9 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | a9 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a9 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | a9 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI +/* i = 10 */ - // | w13 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | a10 @ DX + MOVQ 80(DI), DX + XORQ AX, AX - // | w14 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | a10 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 80(SP) + MOVQ $0x00, CX - // | w15 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | a10 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w16 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | a10 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | w17 @ R12 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | a10 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w18 @ R13 - ADDQ SI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a10 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | - MOVQ 88(SP), BX - MOVQ BX, 136(SP) + // | a10 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | (u @ CX) = (w7 @ 128(SP)) * inp - MOVQ 128(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a10 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w7 @ 128(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, DI + // | a10 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a10 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI +/* i = 11 */ - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | a11 @ DX + MOVQ 88(DI), DX + XORQ AX, AX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | a11 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 88(SP) + MOVQ $0x00, R8 - // | w13 
@ R8 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | a11 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a11 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w14 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | a11 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w15 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | a11 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w16 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | a11 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w17 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | a11 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w18 @ R13 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | a11 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w19 @ 136(SP) - ADDQ SI, R15 - ADCQ R15, 136(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a11 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 - // | - MOVQ 96(SP), BX - MOVQ BX, 128(SP) + // | - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 12 */ - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + // | a12 @ DX + MOVQ 96(DI), DX + XORQ AX, AX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a12 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 96(SP) + MOVQ $0x00, R9 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | a12 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | a12 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | a12 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w13 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | a12 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w14 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | a12 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w15 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | a12 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w16 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | a12 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w17 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | a12 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 - // | w18 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R13 - 
ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | - // | w19 @ 136(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, SI - ADDQ DI, 136(SP) - ADCQ $0x00, SI +/* i = 13 */ - // | w20 @ 128(SP) - ADDQ SI, R15 - ADCQ R15, 128(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a13 @ DX + MOVQ 104(DI), DX + XORQ AX, AX - // | - MOVQ 104(SP), BX - MOVQ BX, (SP) + // | a13 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 104(SP) + MOVQ $0x00, R10 - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a13 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + // | a13 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | a13 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | a13 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | a13 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w13 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | a13 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w14 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | a13 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w15 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | a13 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 - // | w16 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | - // | w17 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI +/* i = 14 */ - // | w18 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | a14 @ DX + MOVQ 112(DI), DX + XORQ AX, AX - // | w19 @ 136(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI - ADDQ SI, 136(SP) - ADCQ $0x00, DI + // | a14 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 112(SP) + MOVQ $0x00, R11 - // | w20 @ 128(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI + // | a14 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | w21 @ (SP) - ADDQ SI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a14 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | - MOVQ 112(SP), BX - MOVQ BX, 8(SP) + // | a14 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | (u @ CX) = (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a14 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + // | a14 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ 
AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | a14 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | a14 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + + // | a14 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 - // | w13 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | - // | w14 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI +/* i = 15 */ - // | w15 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | a15 @ DX + MOVQ 120(DI), DX + XORQ AX, AX - // | w16 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | a15 * b0 + MULXQ (SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 120(SP) + MOVQ $0x00, R12 - // | w17 @ R12 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | a15 * b1 + MULXQ 8(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | w18 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | a15 * b2 + MULXQ 16(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | w19 @ 136(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, SI - ADDQ DI, 136(SP) - ADCQ $0x00, SI + // | a15 * b3 + MULXQ 24(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | w20 @ 128(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, DI - ADDQ SI, 128(SP) - ADCQ $0x00, DI + // | a15 * b4 + MULXQ 32(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | w21 @ (SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | a15 * b5 + MULXQ 40(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | w22 @ 8(SP) - ADDQ SI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a15 * b6 + MULXQ 48(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | - MOVQ 120(SP), BX - MOVQ BX, 16(SP) + // | a15 * b7 + MULXQ 56(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | (u @ CX) = (w11 @ 24(SP)) * inp - MOVQ 24(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a15 * b8 + MULXQ 64(SI), AX, BX + ADOXQ AX, R11 + ADOXQ BX, R12 + ADCQ $0x00, R12 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI + // | - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI +/* */ - // | w13 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 R13 | 17 R14 | 18 R15 | 19 CX | 20 R8 | 21 R9 | 22 R10 | 23 R11 | 24 R12 | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - - // | w14 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI - // | w15 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + MOVQ R13, 128(SP) + MOVQ R14, 136(SP) + MOVQ R15, 144(SP) + MOVQ CX, 152(SP) + MOVQ R8, 160(SP) + 
MOVQ R9, 168(SP) + MOVQ R10, 176(SP) + MOVQ R11, 184(SP) + MOVQ R12, 192(SP) - // | w16 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | + // | W right at stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 184(SP) | 24 192(SP) | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - - // | w17 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI - // | w18 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + XORQ AX, AX - // | w19 @ 136(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, DI - ADDQ SI, 136(SP) - ADCQ $0x00, DI + // | - // | w20 @ 128(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, SI - ADDQ DI, 128(SP) - ADCQ $0x00, SI +/* i = 0 */ - // | w21 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | a0 @ DX + MOVQ (DI), DX - // | w22 @ 8(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | a0 * b9 + MULXQ 72(SI), AX, CX + MOVQ AX, 200(SP) - // | w23 @ 16(SP) - ADDQ SI, R15 - ADCQ R15, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a0 * b10 + MULXQ 80(SI), AX, R8 + ADCXQ AX, CX - // | Reduce by modulus - MOVQ 32(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 24(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 144(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 152(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 160(SP) - MOVQ 136(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 168(SP) - MOVQ 128(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 176(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 184(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 192(SP) - MOVQ 16(SP), BX - SBBQ 88(R14), BX - MOVQ BX, 200(SP) - SBBQ $0x00, R15 + // | a0 * b11 + MULXQ 88(SI), AX, R9 + ADCXQ AX, R8 - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 32(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 24(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 144(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 152(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 160(SP), R13 - MOVQ R13, 48(DI) - MOVQ 136(SP), BX - CMOVQCC 168(SP), BX - MOVQ BX, 56(DI) - MOVQ 128(SP), BX - CMOVQCC 176(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 184(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 192(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - CMOVQCC 200(SP), BX - MOVQ BX, 88(DI) - RET + // | a0 * b12 + MULXQ 96(SI), AX, R10 + ADCXQ AX, R9 + // | a0 * b13 + MULXQ 104(SI), AX, R11 + ADCXQ AX, R10 -// func mul13(c *[26]uint64, a *[13]uint64, b *[13]uint64, p *[13]uint64, inp uint64) -TEXT ·mul13(SB), $232-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | a0 * b14 + MULXQ 112(SI), AX, R12 + ADCXQ AX, R11 - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 + // | a0 * b15 + MULXQ 120(SI), AX, R13 + ADCXQ AX, R12 + ADCQ $0x00, R13 + + // | + +/* i = 1 */ + + // | a1 @ DX + MOVQ 8(DI), DX XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) 
- MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - MOVQ $0x00000000, 112(SP) - MOVQ $0x00000000, 120(SP) - MOVQ $0x00000000, 128(SP) - MOVQ $0x00000000, 136(SP) - - // | - // | b0 - MOVQ (SI), CX - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | a1 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 208(SP) - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 + // | a1 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | a1 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 + // | a1 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | a1 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | a1 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 + // | a1 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) +/* i = 2 */ - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) + // | a2 @ DX + MOVQ 16(DI), DX + XORQ R15, R15 + + // | a2 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 216(SP) + + // | a2 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | a2 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a11 * b0 - // | (w11, w12) @ (24(SP), 32(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) + // | a2 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a12 * b0 - // | (w12, w13) @ (32(SP), 40(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) + // | a2 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | - // | b1 - MOVQ 8(SI), CX + // | a2 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | a2 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 +/* i = 3 */ - // | a3 * b1 - // | 
(w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | a3 @ DX + MOVQ 24(DI), DX + XORQ CX, CX - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | a3 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 224(SP) - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | a3 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | a3 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a3 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a3 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a3 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a3 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX - // | a11 * b1 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a12 * b1 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 4 */ - // | - // | b2 - MOVQ 16(SI), CX + // | a4 @ DX + MOVQ 32(DI), DX + XORQ R8, R8 - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | a4 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 + MOVQ R10, 232(SP) - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | a4 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | a4 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | a4 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | a4 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, 
R15 - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | a4 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a4 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 5 */ - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a5 @ DX + MOVQ 40(DI), DX + XORQ R9, R9 - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a5 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 + MOVQ R11, 240(SP) - // | a11 * b2 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a5 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a12 * b2 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a5 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | - // | b3 - MOVQ 24(SI), CX + // | a5 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | a5 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | a5 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | a5 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* i = 6 */ - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a6 @ DX + MOVQ 48(DI), DX + XORQ R10, R10 - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a6 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 + MOVQ R12, 248(SP) 
- // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a6 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a6 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a6 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a6 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a11 * b3 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a6 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a12 * b3 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a6 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R9 + ADOXQ R10, R10 + ADCXQ BX, R10 - // | - // | b4 - MOVQ 32(SI), CX + // | - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 +/* i = 7 */ - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | a7 @ DX + MOVQ 56(DI), DX + XORQ R11, R11 - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | a7 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 + MOVQ R13, 256(SP) - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | a7 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a7 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a7 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a7 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a7 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX 
- MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a7 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R10 + ADOXQ R11, R11 + ADCXQ BX, R11 - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* i = 8 */ - // | a11 * b4 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a8 @ DX + MOVQ 64(DI), DX + XORQ R12, R12 - // | a12 * b4 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a8 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 + MOVQ R14, 264(SP) - // | - // | b5 - MOVQ 40(SI), CX + // | a8 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | a8 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | a8 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | a8 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a8 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a8 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R11 + ADOXQ R12, R12 + ADCXQ BX, R12 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 9 */ - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a9 @ DX + MOVQ 72(DI), DX + XORQ R13, R13 - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a9 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX + MOVQ R15, 272(SP) - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a9 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a10 * 
b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a9 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a11 * b5 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a9 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a12 * b5 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a9 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | - // | b6 - MOVQ 48(SI), CX + // | a9 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | a9 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R12 + ADOXQ R13, R13 + ADCXQ BX, R13 - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) +/* i = 10 */ - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a10 @ DX + MOVQ 80(DI), DX + XORQ R14, R14 - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a10 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 + MOVQ CX, 280(SP) - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a10 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a10 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a10 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a10 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a10 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a10 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R13 + ADOXQ R14, R14 + ADCXQ BX, R14 - // | a11 * b6 - // | 
(w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | - // | a12 * b6 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) +/* i = 11 */ - // | - // | b7 - MOVQ 56(SI), CX + // | a11 @ DX + MOVQ 88(DI), DX + XORQ R15, R15 - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | a11 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 + MOVQ R8, 288(SP) - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a11 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a11 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a11 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a11 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a11 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a11 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R14 + ADOXQ R15, R15 + ADCXQ BX, R15 - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* i = 12 */ - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a12 @ DX + MOVQ 96(DI), DX + XORQ CX, CX - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a12 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R9 + ADCXQ BX, R10 + MOVQ R9, 296(SP) - // | a11 * b7 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | a12 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a12 * b7 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - 
ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | a12 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | - // | b8 - MOVQ 64(SI), CX + // | a12 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | a12 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | a12 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a12 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R15 + ADOXQ CX, CX + ADCXQ BX, CX - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* i = 13 */ - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a13 @ DX + MOVQ 104(DI), DX + XORQ R8, R8 - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a13 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R10 + ADCXQ BX, R11 - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a13 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a13 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a13 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | a13 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a11 * b8 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | a13 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a12 * b8 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | a13 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, CX + ADOXQ R8, R8 + ADCXQ BX, R8 - // | - // | b9 - MOVQ 72(SI), CX + // | - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - 
MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) +/* i = 14 */ - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a14 @ DX + MOVQ 112(DI), DX + XORQ R9, R9 - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a14 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R11 + ADCXQ BX, R12 - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a14 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a14 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a14 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a14 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a14 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | a14 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R8 + ADOXQ R9, R9 + ADCXQ BX, R9 - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | a10 * b9 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +/* i = 15 */ - // | a11 * b9 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | a15 @ DX + MOVQ 120(DI), DX + XORQ DI, DI - // | a12 * b9 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | a15 * b9 + MULXQ 72(SI), AX, BX + ADOXQ AX, R12 + ADCXQ BX, R13 - // | - // | b10 - MOVQ 80(SI), CX + // | a15 * b10 + MULXQ 80(SI), AX, BX + ADOXQ AX, R13 + ADCXQ BX, R14 - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | a15 * b11 + MULXQ 88(SI), AX, BX + ADOXQ AX, R14 + ADCXQ BX, R15 - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 
32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a15 * b12 + MULXQ 96(SI), AX, BX + ADOXQ AX, R15 + ADCXQ BX, CX - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | a15 * b13 + MULXQ 104(SI), AX, BX + ADOXQ AX, CX + ADCXQ BX, R8 - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | a15 * b14 + MULXQ 112(SI), AX, BX + ADOXQ AX, R8 + ADCXQ BX, R9 - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | a15 * b15 + MULXQ 120(SI), AX, BX + ADOXQ AX, R9 + ADOXQ BX, DI + ADCQ $0x00, DI - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* */ - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | + // | W left + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 R15 | 28 CX | 29 R8 | 30 R9 | 31 DI - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) - // | a9 * b10 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | + // | W right + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 184(SP) | 24 192(SP) | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - - // | a10 * b10 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) - // | a11 * b10 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + MOVQ 72(SP), AX + ADDQ AX, 200(SP) + MOVQ 80(SP), AX + ADCQ AX, 208(SP) + MOVQ 88(SP), AX + ADCQ AX, 216(SP) + MOVQ 96(SP), AX + ADCQ AX, 224(SP) + MOVQ 104(SP), AX + ADCQ AX, 232(SP) + MOVQ 112(SP), AX + ADCQ AX, 240(SP) + MOVQ 120(SP), AX + ADCQ AX, 248(SP) + MOVQ 128(SP), AX + ADCQ AX, 256(SP) + MOVQ 136(SP), AX + ADCQ AX, 264(SP) + MOVQ 144(SP), AX + ADCQ AX, 272(SP) + MOVQ 152(SP), AX + ADCQ AX, 280(SP) + MOVQ 160(SP), AX + ADCQ AX, 288(SP) + MOVQ 168(SP), AX + ADCQ AX, 296(SP) + ADCQ 176(SP), R10 + ADCQ 184(SP), R11 + ADCQ 192(SP), R12 + ADCQ $0x00, R13 + ADCQ $0x00, R14 + ADCQ $0x00, 
R15 + ADCQ $0x00, CX + ADCQ $0x00, R8 + ADCQ $0x00, R9 + ADCQ $0x00, DI - // | a12 * b10 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 R10 | 23 R11 | 24 R12 | 25 R13 | 26 R14 | 27 R15 | 28 CX | 29 R8 | 30 R9 | 31 DI - // | - // | b11 - MOVQ 88(SI), CX - // | a0 * b11 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + MOVQ (SP), BX + MOVQ 8(SP), SI + MOVQ DI, (SP) + MOVQ 16(SP), DI + MOVQ R9, 8(SP) + MOVQ 24(SP), R9 + MOVQ R8, 16(SP) + MOVQ 32(SP), R8 + MOVQ CX, 24(SP) + MOVQ 40(SP), CX + MOVQ R15, 32(SP) + MOVQ 48(SP), R15 + MOVQ R14, 40(SP) + MOVQ 56(SP), R14 + MOVQ R13, 48(SP) + MOVQ 64(SP), R13 + MOVQ R12, 56(SP) + MOVQ 200(SP), R12 + MOVQ R11, 64(SP) + MOVQ R10, 72(SP) - // | a1 * b11 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | fetch modulus + MOVQ p+24(FP), R10 - // | a2 * b11 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | + // | W ready to mont + // | 0 BX | 1 SI | 2 DI | 3 R9 | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a3 * b11 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) - // | a4 * b11 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a5 * b11 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* montgomery reduction q1 */ - // | a6 * b11 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | clear flags + XORQ AX, AX - // | a7 * b11 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | a8 * b11 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +/* i = 0 */ - // | a9 * b11 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | + // | W + // | 0 BX | 1 SI | 2 DI | 3 R9 | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 208(SP) | 11 216(SP) | 
12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a10 * b11 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) - // | a11 * b11 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | | u0 = w0 * inp + MOVQ BX, DX + MULXQ inp+32(FP), DX, R11 - // | a12 * b11 - // | (w23, w24, w25) @ (120(SP), 128(SP), 136(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) + // | save u0 + MOVQ DX, 80(SP) - // | - // | b12 - MOVQ 96(SI), CX + // | - // | a0 * b12 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* */ - // | a1 * b12 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j0 - // | a2 * b12 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w0 @ BX + MULXQ (R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | a3 * b12 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j1 - // | a4 * b12 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w1 @ SI + MULXQ 8(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | a5 * b12 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j2 - // | a6 * b12 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w2 @ DI + MULXQ 16(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | a7 * b12 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j3 - // | a8 * b12 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w3 @ R9 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | a9 * b12 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j4 - // | a10 * b12 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w4 @ R8 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a11 * b12 - // | (w23, w24, w25) @ (120(SP), 128(SP), 136(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ 
$0x00, 136(SP) + // | j5 - // | a12 * b12 - // | (w24, w25) @ (128(SP), 136(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) + // | w5 @ CX + MULXQ 40(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | - // | Montgomerry Reduction - MOVQ R15, 144(SP) - MOVQ R14, 152(SP) - MOVQ p+24(FP), R14 + // | j6 - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w6 @ R15 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI + // | j7 - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w7 @ R14 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j8 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w8 @ R13 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 + ADOXQ BX, R12 + ADCXQ BX, BX + MOVQ $0x00, AX + ADOXQ AX, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | - // | w6 @ 152(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI +/* i = 1 */ - // | w7 @ 144(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 SI | 2 DI | 3 R9 | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | | u1 = w1 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R11 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | save u1 + MOVQ DX, 88(SP) - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI +/* */ - // | w13 @ 40(SP) - ADDQ DI, 40(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j0 - // | - MOVQ 48(SP), R8 + // | w1 @ SI + MULXQ (R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j1 - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + // | w2 @ DI + MULXQ 8(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j2 - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | w3 @ R9 + MULXQ 16(R10), 
AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j3 - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | w4 @ R8 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | w6 @ 152(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, SI - ADDQ DI, 152(SP) - ADCQ $0x00, SI + // | j4 - // | w7 @ 144(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI - ADDQ SI, 144(SP) - ADCQ $0x00, DI + // | w5 @ CX + MULXQ 32(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j5 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w6 @ R15 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 + + // | j6 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w7 @ R14 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | j7 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | w8 @ R13 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | j8 - // | w14 @ R8 - ADDQ DI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w9 @ R12 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R12 - // | - MOVQ 56(SP), R9 + // | w10 @ 208(SP) + // | move to temp register + MOVQ 208(SP), AX + ADCXQ R11, AX + ADOXQ BX, AX - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | move to an idle register + // | w10 @ AX + MOVQ AX, BX + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | clear flags + XORQ AX, AX - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI +/* i = 2 */ - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 DI | 3 R9 | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w6 @ 152(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI - // | w7 @ 144(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | | u2 = w2 * inp + MOVQ DI, DX + MULXQ inp+32(FP), DX, R11 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | save u2 + MOVQ DX, 
96(SP) - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI +/* */ - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j0 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w2 @ DI + MULXQ (R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j1 - // | w14 @ R8 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | w3 @ R9 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | w15 @ R9 - ADDQ DI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j2 - // | - MOVQ 64(SP), R10 + // | w4 @ R8 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j3 - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + // | w5 @ CX + MULXQ 24(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j4 - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | w6 @ R15 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | w6 @ 152(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, SI - ADDQ DI, 152(SP) - ADCQ $0x00, SI + // | j5 - // | w7 @ 144(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI - ADDQ SI, 144(SP) - ADCQ $0x00, DI + // | w7 @ R14 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j6 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w8 @ R13 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | j7 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w9 @ R12 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | j8 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | w10 @ BX + MULXQ 64(R10), AX, R11 + ADOXQ AX, BX - // | w14 @ R8 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w11 @ 216(SP) + // | move to temp register + MOVQ 216(SP), AX + ADCXQ R11, AX + ADOXQ SI, AX + + // | move to an idle register + // | w11 @ AX + MOVQ AX, SI + ADCXQ DI, DI + MOVQ $0x00, AX + ADOXQ AX, DI - // | w15 @ R9 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // 
| w16 @ R10 - ADDQ DI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 72(SP), R11 +/* i = 3 */ - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | + // | W + // | 0 - | 1 - | 2 - | 3 R9 | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | | u3 = w3 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R11 - // | w6 @ 152(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI + // | save u3 + MOVQ DX, 104(SP) - // | w7 @ 144(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI +/* */ - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j0 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w3 @ R9 + MULXQ (R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j1 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w4 @ R8 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j2 - // | w14 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | w5 @ CX + MULXQ 16(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | w15 @ R9 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j3 - // | w16 @ R10 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | w6 @ R15 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 + + // | j4 - // | w17 @ R11 - ADDQ DI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w7 @ R14 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | - MOVQ 80(SP), R12 + // | j5 - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w8 @ R13 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + // | j6 - // | w6 @ 152(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, SI - ADDQ DI, 152(SP) - ADCQ $0x00, SI + // | w9 @ R12 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | w7 @ 144(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI - ADDQ SI, 144(SP) - ADCQ $0x00, DI + // | j7 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI 
+ // | w10 @ BX + MULXQ 56(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | j8 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w11 @ SI + MULXQ 64(R10), AX, R11 + ADOXQ AX, SI - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w12 @ 224(SP) + // | move to temp register + MOVQ 224(SP), AX + ADCXQ R11, AX + ADOXQ DI, AX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | move to an idle register + // | w12 @ AX + MOVQ AX, DI + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w14 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | - // | w15 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI +/* i = 4 */ - // | w16 @ R10 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R8 | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 DI | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w17 @ R11 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI - // | w18 @ R12 - ADDQ DI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | | u4 = w4 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, R11 - // | - MOVQ 88(SP), R13 + // | save u4 + MOVQ DX, 112(SP) - // | (u @ CX) = (w6 @ 152(SP)) * inp - MOVQ 152(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w6 @ 152(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI +/* */ - // | w7 @ 144(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | j0 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w4 @ R8 + MULXQ (R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j1 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w5 @ CX + MULXQ 8(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j2 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w6 @ R15 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j3 - // | w14 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 
- ADCQ $0x00, DI + // | w7 @ R14 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w15 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j4 - // | w16 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | w8 @ R13 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w17 @ R11 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j5 - // | w18 @ R12 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w9 @ R12 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | w19 @ R13 - ADDQ DI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j6 - // | - MOVQ 96(SP), BX - MOVQ BX, 152(SP) + // | w10 @ BX + MULXQ 48(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | (u @ CX) = (w7 @ 144(SP)) * inp - MOVQ 144(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j7 - // | w7 @ 144(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI + // | w11 @ SI + MULXQ 56(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j8 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w12 @ DI + MULXQ 64(R10), AX, R11 + ADOXQ AX, DI - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w13 @ 232(SP) + // | move to temp register + MOVQ 232(SP), AX + ADCXQ R11, AX + ADOXQ R9, AX + + // | move to an idle register + // | w13 @ AX + MOVQ AX, R9 + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI +/* i = 5 */ - // | w14 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 CX | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w15 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI - // | w16 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | | u5 = w5 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R11 - // | w17 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | save u5 + MOVQ DX, 120(SP) - // | w18 @ R12 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | - // | w19 @ R13 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI +/* */ - // | w20 @ 152(SP) - ADDQ DI, R15 - ADCQ 
R15, 152(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j0 - // | - MOVQ 104(SP), BX - MOVQ BX, 144(SP) + // | w5 @ CX + MULXQ (R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j1 - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + // | w6 @ R15 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j2 + + // | w7 @ R14 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 + + // | j3 + + // | w8 @ R13 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 + + // | j4 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w9 @ R12 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j5 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w10 @ BX + MULXQ 40(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j6 - // | w14 @ R8 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | w11 @ SI + MULXQ 48(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w15 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j7 - // | w16 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | w12 @ DI + MULXQ 56(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | w17 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j8 - // | w18 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w13 @ R9 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R9 - // | w19 @ R13 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w14 @ 240(SP) + // | move to temp register + MOVQ 240(SP), AX + ADCXQ R11, AX + ADOXQ R8, AX - // | w20 @ 152(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI + // | move to an idle register + // | w14 @ AX + MOVQ AX, R8 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX - // | w21 @ 144(SP) - ADDQ DI, R15 - ADCQ R15, 144(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | clear flags + XORQ AX, AX - // | - MOVQ 112(SP), BX - MOVQ BX, (SP) + // | - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 6 */ - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R15 | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - 
ADCQ $0x00, SI - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | | u6 = w6 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R11 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | save u6 + MOVQ DX, 128(SP) - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | - // | w14 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI +/* */ - // | w15 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j0 - // | w16 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | w6 @ R15 + MULXQ (R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | w17 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | j1 - // | w18 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w7 @ R14 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w19 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j2 - // | w20 @ 152(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, SI - ADDQ DI, 152(SP) - ADCQ $0x00, SI + // | w8 @ R13 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w21 @ 144(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI - ADDQ SI, 144(SP) - ADCQ $0x00, DI + // | j3 - // | w22 @ (SP) - ADDQ DI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w9 @ R12 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | - MOVQ 120(SP), BX - MOVQ BX, 8(SP) + // | j4 - // | (u @ CX) = (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w10 @ BX + MULXQ 32(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + // | j5 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w11 @ SI + MULXQ 40(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | j6 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w12 @ DI + MULXQ 48(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | w14 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j7 - // | w15 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w13 @ R9 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | w16 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j8 - // | w17 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R8 - // | w18 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w15 
@ 248(SP) + // | move to temp register + MOVQ 248(SP), AX + ADCXQ R11, AX + ADOXQ CX, AX - // | w19 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | move to an idle register + // | w15 @ AX + MOVQ AX, CX + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 - // | w20 @ 152(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w21 @ 144(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | - // | w22 @ (SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI +/* i = 7 */ - // | w23 @ 8(SP) - ADDQ DI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R14 | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | - MOVQ 128(SP), BX - MOVQ BX, 16(SP) - // | (u @ CX) = (w11 @ 24(SP)) * inp - MOVQ 24(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | | u7 = w7 * inp + MOVQ R14, DX + MULXQ inp+32(FP), DX, R11 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI + // | save u7 + MOVQ DX, 136(SP) - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI +/* */ - // | w14 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | j0 - // | w15 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w7 @ R14 + MULXQ (R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, R13 - // | w16 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j1 - // | w17 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | w8 @ R13 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 - // | w18 @ R12 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j2 - // | w19 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | w9 @ R12 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | w20 @ 152(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, SI - ADDQ DI, 152(SP) - ADCQ $0x00, SI + // | j3 - // | w21 @ 144(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, DI - ADDQ SI, 144(SP) - ADCQ $0x00, DI + // | w10 @ BX + MULXQ 24(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | w22 @ (SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j4 - // | w23 @ 8(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w11 @ SI + MULXQ 32(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w24 @ 16(SP) - ADDQ DI, R15 - ADCQ R15, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 
+ // | j5 - // | - MOVQ 136(SP), BX - MOVQ BX, 24(SP) + // | w12 @ DI + MULXQ 40(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | (u @ CX) = (w12 @ 32(SP)) * inp - MOVQ 32(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j6 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI + // | w13 @ R9 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j7 // | w14 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + MULXQ 56(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | w15 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j8 - // | w16 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | w15 @ CX + MULXQ 64(R10), AX, R11 + ADOXQ AX, CX - // | w17 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w16 @ 256(SP) + // | move to temp register + MOVQ 256(SP), AX + ADCXQ R11, AX + ADOXQ R15, AX + + // | move to an idle register + // | w16 @ AX + MOVQ AX, R15 + ADCXQ R14, R14 + MOVQ $0x00, AX + ADOXQ AX, R14 - // | w18 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w19 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | - // | w20 @ 152(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, DI - ADDQ SI, 152(SP) - ADCQ $0x00, DI +/* i = 8 */ - // | w21 @ 144(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, SI - ADDQ DI, 144(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R13 | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w22 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI - // | w23 @ 8(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | | u8 = w8 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R11 - // | w24 @ 16(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | save u8 + MOVQ DX, 144(SP) - // | w25 @ 24(SP) - ADDQ DI, R15 - ADCQ R15, 24(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | Reduce by modulus - MOVQ 40(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 32(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 160(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 168(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 176(SP) - MOVQ 152(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 184(SP) - MOVQ 144(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 192(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 200(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 208(SP) - MOVQ 16(SP), BX - SBBQ 88(R14), BX - MOVQ BX, 216(SP) - MOVQ 24(SP), BX - SBBQ 96(R14), BX - MOVQ BX, 224(SP) - SBBQ $0x00, R15 +/* */ - // | Compare & Return - MOVQ 
c+0(FP), DI - CMOVQCS 40(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 32(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 160(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 168(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 176(SP), R13 - MOVQ R13, 48(DI) - MOVQ 152(SP), BX - CMOVQCC 184(SP), BX - MOVQ BX, 56(DI) - MOVQ 144(SP), BX - CMOVQCC 192(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 200(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 208(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - CMOVQCC 216(SP), BX - MOVQ BX, 88(DI) - MOVQ 24(SP), BX - CMOVQCC 224(SP), BX - MOVQ BX, 96(DI) - RET + // | j0 + // | w8 @ R13 + MULXQ (R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R12 -// func mul14(c *[28]uint64, a *[14]uint64, b *[14]uint64, p *[14]uint64, inp uint64) -TEXT ·mul14(SB), $256-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | j1 - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - MOVQ $0x00000000, 112(SP) - MOVQ $0x00000000, 120(SP) - MOVQ $0x00000000, 128(SP) - MOVQ $0x00000000, 136(SP) - MOVQ $0x00000000, 144(SP) - MOVQ $0x00000000, 152(SP) - - // | - // | b0 - MOVQ (SI), CX + // | w9 @ R12 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | j2 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 + // | w10 @ BX + MULXQ 16(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | j3 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 + // | w11 @ SI + MULXQ 24(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | j4 - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | w12 @ DI + MULXQ 32(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 + // | j5 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | w13 @ R9 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | j6 - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) + // | w14 @ R8 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | j7 - // | a11 * b0 - // | (w11, w12) @ (24(SP), 32(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) + // | w15 @ CX + MULXQ 56(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a12 * b0 - // | (w12, w13) @ (32(SP), 40(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 
40(SP) + // | j8 - // | a13 * b0 - // | (w13, w14) @ (40(SP), 48(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) + // | w16 @ R15 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R15 - // | - // | b1 - MOVQ 8(SI), CX + // | w17 @ 264(SP) + // | move to temp register + MOVQ 264(SP), AX + ADCXQ R11, AX + ADOXQ R14, AX + + // | move to an idle register + // | w17 @ AX + MOVQ AX, R14 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | + // | W montgomery reduction q1 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | carry from q1 should be added to w18 + MOVQ R13, 152(SP) - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) +/* montgomerry reduction q2 */ - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | clear flags + XORQ R13, R13 + + // | + +/* i = 0 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) + + + // | u0 @ 80(SP) + MOVQ 80(SP), DX - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) +/* */ - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j9 - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w9 @ R12 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, BX - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j10 - // | a11 * b1 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w10 @ BX + MULXQ 80(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, SI - // | a12 * b1 - // | (w13, 
w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j11 - // | a13 * b1 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w11 @ SI + MULXQ 88(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | - // | b2 - MOVQ 16(SI), CX + // | j12 - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | w12 @ DI + MULXQ 96(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | j13 - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w13 @ R9 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j14 - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w14 @ R8 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | j15 - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w15 @ CX + MULXQ 120(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 + ADOXQ R13, R15 + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | clear flags + XORQ AX, AX - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 1 */ - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 BX | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a11 * b2 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) - // | a12 * b2 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | u1 @ 88(SP) + MOVQ 88(SP), DX - // | a13 * b2 - // | 
(w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | - // | b3 - MOVQ 24(SI), CX +/* */ - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | j9 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w10 @ BX + MULXQ 72(R10), AX, R11 + ADOXQ AX, BX + MOVQ BX, 80(SP) + ADCXQ R11, SI - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j10 - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w11 @ SI + MULXQ 80(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | j11 - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w12 @ DI + MULXQ 88(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j12 - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w13 @ R9 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j13 - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w14 @ R8 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j14 - // | a11 * b3 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w15 @ CX + MULXQ 112(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a12 * b3 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j15 - // | a13 * b3 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w16 @ R15 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 + ADOXQ R13, R14 - // | - // | b4 - MOVQ 32(SI), CX + // | bring the carry from q1 + MOVQ 152(SP), R13 + MOVQ $0x00, AX + ADCXQ AX, R13 + ADOXQ AX, R13 - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - 
ADCQ $0x00, R15 + // | clear flags + XORQ AX, AX - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) +/* i = 2 */ - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 SI | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | u2 @ 96(SP) + MOVQ 96(SP), DX - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* */ - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j9 - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w11 @ SI + MULXQ 72(R10), AX, R11 + ADOXQ AX, SI + MOVQ SI, 88(SP) + ADCXQ R11, DI - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j10 - // | a11 * b4 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w12 @ DI + MULXQ 80(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R9 - // | a12 * b4 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j11 - // | a13 * b4 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w13 @ R9 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | - // | b5 - MOVQ 40(SI), CX + // | j12 - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w14 @ R8 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j13 - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - 
MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w15 @ CX + MULXQ 104(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j14 - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w16 @ R15 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j15 - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w17 @ R14 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R14 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w18 @ 272(SP) + // | move to an idle register + MOVQ 272(SP), BX - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w18 @ BX + ADCXQ R11, BX + ADOXQ R13, BX + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | clear flags + XORQ AX, AX - // | a10 * b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a11 * b5 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* i = 3 */ - // | a12 * b5 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 DI | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 BX | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a13 * b5 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) - // | - // | b6 - MOVQ 48(SI), CX + // | u3 @ 104(SP) + MOVQ 104(SP), DX - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* */ - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j9 - // | a3 * b6 - // | (w9, w10, w11, 
w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w12 @ DI + MULXQ 72(R10), AX, R11 + ADOXQ AX, DI + MOVQ DI, 96(SP) + ADCXQ R11, R9 - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j10 - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w13 @ R9 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, R8 - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j11 - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w14 @ R8 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j12 - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w15 @ CX + MULXQ 96(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j13 - // | a11 * b6 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w16 @ R15 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | a12 * b6 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j14 - // | a13 * b6 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w17 @ R14 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | - // | b7 - MOVQ 56(SI), CX + // | j15 - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w18 @ BX + MULXQ 120(R10), AX, R11 + ADOXQ AX, BX - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w19 @ 280(SP) + // | move to an idle register + MOVQ 280(SP), DI - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w19 @ DI + ADCXQ R11, DI + ADOXQ R13, DI + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | clear flags + XORQ AX, AX - // | a4 * b7 
- // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* i = 4 */ - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R9 | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 BX | 19 DI | 20 288(SP) | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | u4 @ 112(SP) + MOVQ 112(SP), DX + + // | - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* */ - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j9 - // | a11 * b7 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w13 @ R9 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R9 + MOVQ R9, 104(SP) + ADCXQ R11, R8 - // | a12 * b7 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j10 - // | a13 * b7 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w14 @ R8 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | - // | b8 - MOVQ 64(SI), CX + // | j11 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w15 @ CX + MULXQ 88(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j12 - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w16 @ R15 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j13 - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | 
w17 @ R14 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j14 - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w18 @ BX + MULXQ 112(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, DI - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j15 - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w19 @ DI + MULXQ 120(R10), AX, R11 + ADOXQ AX, DI - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w20 @ 288(SP) + // | move to an idle register + MOVQ 288(SP), SI - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w20 @ SI + ADCXQ R11, SI + ADOXQ R13, SI + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a11 * b8 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | clear flags + XORQ AX, AX - // | a12 * b8 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | - // | a13 * b8 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) +/* i = 5 */ - // | - // | b9 - MOVQ 72(SI), CX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R8 | 15 CX + // | 16 R15 | 17 R14 | 18 BX | 19 DI | 20 SI | 21 296(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | u5 @ 120(SP) + MOVQ 120(SP), DX - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* */ - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j9 - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 
72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w14 @ R8 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R8 + MOVQ R8, 112(SP) + ADCXQ R11, CX - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j10 - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w15 @ CX + MULXQ 80(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R15 - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j11 - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w16 @ R15 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | a10 * b9 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j12 - // | a11 * b9 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w17 @ R14 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | a12 * b9 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j13 - // | a13 * b9 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w18 @ BX + MULXQ 104(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, DI - // | - // | b10 - MOVQ 80(SI), CX + // | j14 - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w19 @ DI + MULXQ 112(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, SI - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j15 - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w20 @ SI + MULXQ 120(R10), AX, R11 + ADOXQ AX, SI - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w21 @ 296(SP) + // | move to an idle register + MOVQ 296(SP), R8 - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w21 @ R8 + ADCXQ R11, R8 + ADOXQ R13, R8 + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | clear flags + 
XORQ AX, AX - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* i = 6 */ - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 CX + // | 16 R15 | 17 R14 | 18 BX | 19 DI | 20 SI | 21 R8 | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a9 * b10 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) - // | a10 * b10 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | u6 @ 128(SP) + MOVQ 128(SP), DX - // | a11 * b10 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | - // | a12 * b10 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) +/* */ - // | a13 * b10 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | j9 - // | - // | b11 - MOVQ 88(SI), CX + // | w15 @ CX + MULXQ 72(R10), AX, R11 + ADOXQ AX, CX + MOVQ CX, 120(SP) + ADCXQ R11, R15 - // | a0 * b11 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j10 - // | a1 * b11 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w16 @ R15 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R14 - // | a2 * b11 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j11 - // | a3 * b11 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w17 @ R14 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | a4 * b11 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j12 - // | a5 * b11 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w18 @ BX + MULXQ 96(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, DI - // | a6 * b11 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 72(SP) 
- ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j13 - // | a7 * b11 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w19 @ DI + MULXQ 104(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, SI - // | a8 * b11 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j14 - // | a9 * b11 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w20 @ SI + MULXQ 112(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, R8 - // | a10 * b11 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j15 - // | a11 * b11 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w21 @ R8 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R8 - // | a12 * b11 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | w22 @ 72(SP) + // | move to an idle register + MOVQ 72(SP), CX - // | a13 * b11 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | w22 @ CX + ADCXQ R11, CX + ADOXQ R13, CX + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | - // | b12 - MOVQ 96(SI), CX + // | clear flags + XORQ AX, AX - // | a0 * b12 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a1 * b12 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 7 */ - // | a2 * b12 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 R15 | 17 R14 | 18 BX | 19 DI | 20 SI | 21 R8 | 22 CX | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a3 * b12 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) - // | a4 * b12 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | u7 @ 136(SP) + MOVQ 136(SP), DX - // | a5 * b12 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | - // | a6 * b12 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 
96(SP) - ADCQ $0x00, 104(SP) +/* */ - // | a7 * b12 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j9 - // | a8 * b12 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w16 @ R15 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R15 + MOVQ R15, 72(SP) + ADCXQ R11, R14 - // | a9 * b12 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j10 - // | a10 * b12 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w17 @ R14 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | a11 * b12 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | j11 - // | a12 * b12 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | w18 @ BX + MULXQ 88(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, DI - // | a13 * b12 - // | (w25, w26, w27) @ (136(SP), 144(SP), 152(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) + // | j12 - // | - // | b13 - MOVQ 104(SI), CX + // | w19 @ DI + MULXQ 96(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, SI - // | a0 * b13 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j13 - // | a1 * b13 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w20 @ SI + MULXQ 104(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, R8 - // | a2 * b13 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j14 - // | a3 * b13 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w21 @ R8 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | a4 * b13 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j15 - // | a5 * b13 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w22 @ CX + MULXQ 120(R10), AX, R11 + ADOXQ AX, CX - // | a6 * b13 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w23 @ 64(SP) + // | move to an idle register + MOVQ 64(SP), R9 - // | a7 * b13 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w23 @ R9 + 
ADCXQ R11, R9 + ADOXQ R13, R9 + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | a8 * b13 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | clear flags + XORQ AX, AX - // | a9 * b13 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | - // | a10 * b13 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) +/* i = 8 */ - // | a11 * b13 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 72(SP) | 17 R14 | 18 BX | 19 DI | 20 SI | 21 R8 | 22 CX | 23 R9 | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a12 * b13 - // | (w25, w26, w27) @ (136(SP), 144(SP), 152(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - // | a13 * b13 - // | (w26, w27) @ (144(SP), 152(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) + // | u8 @ 144(SP) + MOVQ 144(SP), DX - // | - // | Montgomerry Reduction - MOVQ R15, 160(SP) - MOVQ R14, 168(SP) - MOVQ p+24(FP), R14 + // | - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI + // | j9 - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w17 @ R14 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j10 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w18 @ BX + MULXQ 80(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, DI - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | j11 - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w19 @ DI + MULXQ 88(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, SI - // | w6 @ 168(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI + // | j12 + + // | w20 @ SI + MULXQ 96(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, R8 - // | w7 @ 160(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | j13 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w21 @ R8 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, CX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j14 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w22 @ CX + MULXQ 112(R10), 
AX, R11 + ADOXQ AX, CX + ADCXQ R11, R9 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j15 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w23 @ R9 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R9 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w24 @ 56(SP) + // | move to an idle register + MOVQ 56(SP), R15 - // | w14 @ 48(SP) - ADDQ SI, 48(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w24 @ R15 + ADCXQ R11, R15 + ADOXQ R13, R15 + MOVQ $0x00, R13 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | - MOVQ 56(SP), R8 + // | + // | q2 ends + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 72(SP) | 17 R14 | 18 BX | 19 DI | 20 SI | 21 R8 | 22 CX | 23 R9 | 24 R15 | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + // | save the carry from q2 + // | should be added to w25 + MOVQ R13, 152(SP) - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI +/* q2 q3 transition swap */ - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + MOVQ 80(SP), R13 + MOVQ R15, 56(SP) + MOVQ 88(SP), R15 + MOVQ R9, 64(SP) + MOVQ 96(SP), R9 + MOVQ CX, 80(SP) + MOVQ 104(SP), CX + MOVQ R8, 88(SP) + MOVQ 112(SP), R8 + MOVQ SI, 96(SP) + MOVQ 120(SP), SI + MOVQ DI, 104(SP) + MOVQ 72(SP), DI - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 R13 | 11 R15 | 12 R9 | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 104(SP) | 20 96(SP) | 21 88(SP) | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w6 @ 168(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI - // | w7 @ 160(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI - ADDQ SI, 160(SP) - ADCQ $0x00, DI + // | - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI +/* montgomery reduction q3 */ - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI +/* i = 9 */ - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 R12 | 10 R13 | 11 
R15 | 12 R9 | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 104(SP) | 20 96(SP) | 21 88(SP) | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | | u9 = w9 * inp + MOVQ R12, DX + MULXQ inp+32(FP), DX, R11 - // | w15 @ R8 - ADDQ SI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | save u9 + MOVQ DX, 72(SP) - // | - MOVQ 64(SP), R9 + // | - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | j0 - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w9 @ R12 + MULXQ (R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | j1 - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w10 @ R13 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | w6 @ 168(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI + // | j2 - // | w7 @ 160(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | w11 @ R15 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | j3 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w12 @ R9 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | j4 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w13 @ CX + MULXQ 32(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | j5 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | j6 - // | w15 @ R8 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w15 @ SI + MULXQ 48(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w16 @ R9 - ADDQ SI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j7 - // | - MOVQ 72(SP), R10 + // | w16 @ DI + MULXQ 56(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j8 - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + // | w17 @ R14 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ 
R11, BX + ADOXQ R12, BX + ADCXQ R12, R12 + MOVQ $0x00, AX + ADOXQ AX, R12 - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | clear flags + XORQ AX, AX - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | - // | w6 @ 168(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI +/* i = 10 */ - // | w7 @ 160(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI - ADDQ SI, 160(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R13 | 11 R15 | 12 R9 | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 104(SP) | 20 96(SP) | 21 88(SP) | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | | u10 = w10 * inp + MOVQ R13, DX + MULXQ inp+32(FP), DX, R11 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | save u10 + MOVQ DX, 112(SP) - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI +/* */ - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | j0 - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | w10 @ R13 + MULXQ (R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | w15 @ R8 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j1 - // | w16 @ R9 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w11 @ R15 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | w17 @ R10 - ADDQ SI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j2 - // | - MOVQ 80(SP), R11 + // | w12 @ R9 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j3 - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI + // | w13 @ CX + MULXQ 24(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | j4 - // | w6 @ 168(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI + // | w14 @ R8 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w7 @ 160(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | j5 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w15 @ SI + MULXQ 40(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - 
ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j6 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w16 @ DI + MULXQ 48(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j7 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w17 @ R14 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j8 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | w18 @ BX + MULXQ 64(R10), AX, R11 + ADOXQ AX, BX - // | w15 @ R8 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w19 @ 104(SP) + // | move to temp register + MOVQ 104(SP), AX + ADCXQ R11, AX + ADOXQ R12, AX + + // | move to an idle register + // | w19 @ AX + MOVQ AX, R12 + ADCXQ R13, R13 + MOVQ $0x00, AX + ADOXQ AX, R13 - // | w16 @ R9 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w17 @ R10 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | - // | w18 @ R11 - ADDQ SI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* i = 11 */ - // | - MOVQ 88(SP), R12 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 R15 | 12 R9 | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 96(SP) | 21 88(SP) | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + // | | u11 = w11 * inp + MOVQ R15, DX + MULXQ inp+32(FP), DX, R11 - // | w6 @ 168(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI + // | save u11 + MOVQ DX, 104(SP) - // | w7 @ 160(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI - ADDQ SI, 160(SP) - ADCQ $0x00, DI + // | - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI +/* */ - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | j0 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w11 @ R15 + MULXQ (R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | j1 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | w12 @ R9 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | j2 - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 72(R14), AX - 
MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | w13 @ CX + MULXQ 16(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | w15 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j3 - // | w16 @ R9 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w17 @ R10 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j4 - // | w18 @ R11 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w15 @ SI + MULXQ 32(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w19 @ R12 - ADDQ SI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j5 - // | - MOVQ 96(SP), R13 + // | w16 @ DI + MULXQ 40(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | (u @ CX) = (w6 @ 168(SP)) * inp - MOVQ 168(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j6 - // | w6 @ 168(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI + // | w17 @ R14 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w7 @ 160(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | j7 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w18 @ BX + MULXQ 56(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j8 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w19 @ R12 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R12 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w20 @ 96(SP) + // | move to temp register + MOVQ 96(SP), AX + ADCXQ R11, AX + ADOXQ R13, AX + + // | move to an idle register + // | w20 @ AX + MOVQ AX, R13 + ADCXQ R15, R15 + MOVQ $0x00, AX + ADOXQ AX, R15 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | clear flags + XORQ AX, AX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI +/* i = 12 */ - // | w15 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 R9 | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 88(SP) | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w16 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI - // | w17 @ R10 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | | u12 = w12 * inp + MOVQ R9, DX + MULXQ inp+32(FP), DX, R11 - // | w18 @ R11 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ 
$0x00, DI + // | save u12 + MOVQ DX, 96(SP) - // | w19 @ R12 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | - // | w20 @ R13 - ADDQ SI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* */ - // | - MOVQ 104(SP), BX - MOVQ BX, 168(SP) + // | j0 - // | (u @ CX) = (w7 @ 160(SP)) * inp - MOVQ 160(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w12 @ R9 + MULXQ (R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | w7 @ 160(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI + // | j1 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | w13 @ CX + MULXQ 8(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | j2 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | j3 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | w15 @ SI + MULXQ 24(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | j4 - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | w16 @ DI + MULXQ 32(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | w15 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j5 - // | w16 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w17 @ R14 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w17 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j6 - // | w18 @ R11 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | w18 @ BX + MULXQ 48(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 + + // | j7 // | w19 @ R12 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + MULXQ 56(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 + + // | j8 // | w20 @ R13 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + MULXQ 64(R10), AX, R11 + ADOXQ AX, R13 - // | w21 @ 168(SP) - ADDQ SI, R15 - ADCQ R15, 168(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w21 @ 88(SP) + // | move to temp register + MOVQ 88(SP), AX + ADCXQ R11, AX + ADOXQ R15, AX - // | - MOVQ 112(SP), BX - MOVQ BX, 160(SP) + // | move to an idle register + // | w21 @ AX + MOVQ AX, R15 + ADCXQ R9, R9 + MOVQ $0x00, AX + ADOXQ AX, R9 - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | clear flags + XORQ AX, AX - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + // | - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI +/* i = 13 */ - // | w10 @ 16(SP) - 
XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 CX | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 R15 | 22 80(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | | u13 = w13 * inp + MOVQ CX, DX + MULXQ inp+32(FP), DX, R11 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | save u13 + MOVQ DX, 88(SP) - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | - // | w15 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI +/* */ - // | w16 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j0 - // | w17 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | w13 @ CX + MULXQ (R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | w18 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | j1 - // | w19 @ R12 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ 8(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w20 @ R13 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j2 - // | w21 @ 168(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI + // | w15 @ SI + MULXQ 16(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w22 @ 160(SP) - ADDQ SI, R15 - ADCQ R15, 160(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j3 - // | - MOVQ 120(SP), BX - MOVQ BX, (SP) + // | w16 @ DI + MULXQ 24(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j4 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + // | w17 @ R14 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | j5 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w18 @ BX + MULXQ 40(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | j6 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | w19 @ R12 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | j7 - // | w15 @ R8 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ 
SI, R8 - ADCQ $0x00, DI + // | w20 @ R13 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | w16 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j8 - // | w17 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | w21 @ R15 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R15 + + // | w22 @ 80(SP) + // | move to temp register + MOVQ 80(SP), AX + ADCXQ R11, AX + ADOXQ R9, AX + + // | move to an idle register + // | w22 @ AX + MOVQ AX, R9 + ADCXQ CX, CX + MOVQ $0x00, AX + ADOXQ AX, CX + + // | clear flags + XORQ AX, AX - // | w18 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | - // | w19 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI +/* i = 14 */ - // | w20 @ R13 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 R8 | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 R15 | 22 R9 | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w21 @ 168(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI - // | w22 @ 160(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | | u14 = w14 * inp + MOVQ R8, DX + MULXQ inp+32(FP), DX, R11 - // | w23 @ (SP) - ADDQ SI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | save u14 + MOVQ DX, 80(SP) - // | - MOVQ 128(SP), BX - MOVQ BX, 8(SP) + // | - // | (u @ CX) = (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + // | j0 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w14 @ R8 + MULXQ (R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, SI - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | j1 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w15 @ SI + MULXQ 8(R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | j2 - // | w15 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w16 @ DI + MULXQ 16(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | w16 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j3 - // | w17 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | w17 @ R14 + MULXQ 24(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | w18 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | j4 + + // | w18 @ BX + MULXQ 32(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 + + // | j5 // | w19 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ 
DI, R12 - ADCQ $0x00, SI + MULXQ 40(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | w20 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j6 - // | w21 @ 168(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI + // | w20 @ R13 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | w22 @ 160(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI - ADDQ SI, 160(SP) - ADCQ $0x00, DI + // | j7 - // | w23 @ (SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | w21 @ R15 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | w24 @ 8(SP) - ADDQ SI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j8 - // | - MOVQ 136(SP), BX - MOVQ BX, 16(SP) + // | w22 @ R9 + MULXQ 64(R10), AX, R11 + ADOXQ AX, R9 - // | (u @ CX) = (w11 @ 24(SP)) * inp - MOVQ 24(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w23 @ 64(SP) + // | move to temp register + MOVQ 64(SP), AX + ADCXQ R11, AX + ADOXQ CX, AX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI + // | move to an idle register + // | w23 @ AX + MOVQ AX, CX + ADCXQ R8, R8 + MOVQ $0x00, AX + ADOXQ AX, R8 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | clear flags + XORQ AX, AX - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI +/* i = 15 */ - // | w15 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 SI + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 R15 | 22 R9 | 23 CX | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w16 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI - // | w17 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | | u15 = w15 * inp + MOVQ SI, DX + MULXQ inp+32(FP), DX, R11 - // | w18 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | save u15 + MOVQ DX, 64(SP) - // | w19 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | - // | w20 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI +/* */ - // | w21 @ 168(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI + // | j0 - // | w22 @ 160(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | w15 @ SI + MULXQ (R10), AX, R11 + ADOXQ AX, SI + ADCXQ R11, DI - // | w23 @ (SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | j1 - // | w24 @ 8(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w16 @ 
DI + MULXQ 8(R10), AX, R11 + ADOXQ AX, DI + ADCXQ R11, R14 - // | w25 @ 16(SP) - ADDQ SI, R15 - ADCQ R15, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j2 - // | - MOVQ 144(SP), BX - MOVQ BX, 24(SP) + // | w17 @ R14 + MULXQ 16(R10), AX, R11 + ADOXQ AX, R14 + ADCXQ R11, BX - // | (u @ CX) = (w12 @ 32(SP)) * inp - MOVQ 32(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j3 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI + // | w18 @ BX + MULXQ 24(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j4 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | w19 @ R12 + MULXQ 32(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | w15 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | j5 - // | w16 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w20 @ R13 + MULXQ 40(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | w17 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j6 - // | w18 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | w21 @ R15 + MULXQ 48(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | w19 @ R12 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j7 - // | w20 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | w22 @ R9 + MULXQ 56(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | w21 @ 168(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, SI - ADDQ DI, 168(SP) - ADCQ $0x00, SI + // | j8 - // | w22 @ 160(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, DI - ADDQ SI, 160(SP) - ADCQ $0x00, DI + // | w23 @ CX + MULXQ 64(R10), AX, R11 + ADOXQ AX, CX - // | w23 @ (SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | w24 @ 56(SP) + // | move to temp register + MOVQ 56(SP), AX + ADCXQ R11, AX + ADOXQ R8, AX - // | w24 @ 8(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | move to an idle register + // | w24 @ AX + MOVQ AX, R8 + ADCXQ SI, SI + MOVQ $0x00, AX + ADOXQ AX, SI - // | w25 @ 16(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 R15 | 22 R9 | 23 CX | 24 R8 | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w26 @ 24(SP) - ADDQ SI, R15 - ADCQ R15, 24(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - // | - MOVQ 152(SP), BX - MOVQ BX, 32(SP) + // | aggregate carries from q2 & q3 + // | should be added to w25 + ADCQ 152(SP), SI - // | (u @ CX) = (w13 @ 40(SP)) * inp - MOVQ 40(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI +/* montgomerry reduction q4 */ - // | w14 @ 
48(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | clear flags + XORQ AX, AX - // | w15 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | - // | w16 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI +/* i = 0 */ - // | w17 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 BX | 19 R12 | 20 R13 | 21 R15 | 22 R9 | 23 CX | 24 R8 | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w18 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI - // | w19 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | u0 @ 72(SP) + MOVQ 72(SP), DX - // | w20 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | - // | w21 @ 168(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, DI - ADDQ SI, 168(SP) - ADCQ $0x00, DI +/* */ - // | w22 @ 160(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, SI - ADDQ DI, 160(SP) - ADCQ $0x00, SI + // | j9 - // | w23 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w18 @ BX + MULXQ 72(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 + MOVQ BX, 56(SP) - // | w24 @ 8(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j10 - // | w25 @ 16(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w19 @ R12 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | w26 @ 24(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j11 - // | w27 @ 32(SP) - ADDQ SI, R15 - ADCQ R15, 32(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w20 @ R13 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | Reduce by modulus - MOVQ 48(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 40(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 176(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 184(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 192(SP) - MOVQ 168(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 200(SP) - MOVQ 160(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 208(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 216(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 224(SP) - MOVQ 16(SP), BX - SBBQ 88(R14), BX - MOVQ BX, 232(SP) - MOVQ 24(SP), BX - SBBQ 96(R14), BX - MOVQ BX, 240(SP) - MOVQ 32(SP), BX - SBBQ 104(R14), BX - MOVQ BX, 248(SP) - SBBQ $0x00, R15 + // | j12 - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 48(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 40(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 176(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 184(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 192(SP), R13 - MOVQ R13, 48(DI) - MOVQ 168(SP), BX - CMOVQCC 200(SP), BX - MOVQ BX, 56(DI) - MOVQ 160(SP), BX - CMOVQCC 208(SP), BX - 
MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 216(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 224(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - CMOVQCC 232(SP), BX - MOVQ BX, 88(DI) - MOVQ 24(SP), BX - CMOVQCC 240(SP), BX - MOVQ BX, 96(DI) - MOVQ 32(SP), BX - CMOVQCC 248(SP), BX - MOVQ BX, 104(DI) - RET + // | w21 @ R15 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 + // | j13 -// func mul12(c *[30]uint64, a *[15]uint64, b *[15]uint64, p *[15]uint64, inp uint64) -TEXT ·mul15(SB), $280-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | w22 @ R9 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - MOVQ $0x00000000, 112(SP) - MOVQ $0x00000000, 120(SP) - MOVQ $0x00000000, 128(SP) - MOVQ $0x00000000, 136(SP) - MOVQ $0x00000000, 144(SP) - MOVQ $0x00000000, 152(SP) - MOVQ $0x00000000, 160(SP) - MOVQ $0x00000000, 168(SP) - - // | - // | b0 - MOVQ (SI), CX + // | j14 - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | w23 @ CX + MULXQ 112(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 + // | j15 - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | w24 @ R8 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R8 - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 + // | w25 @ 48(SP) + // | move to an idle register + MOVQ 48(SP), BX + ADCXQ R11, BX - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | bring carry from q2 & q3 + // | w25 @ BX + ADOXQ SI, BX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 + // | - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 +/* i = 1 */ - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 R12 | 20 R13 | 21 R15 | 22 R9 | 23 CX | 24 R8 | 25 BX | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) + // | u1 @ 112(SP) + MOVQ 112(SP), DX - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | - // | a11 * b0 - // | (w11, w12) @ (24(SP), 32(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) +/* */ - // | a12 * b0 - // | (w12, w13) @ (32(SP), 40(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) + // | j9 - // | a13 * b0 - // | (w13, 
w14) @ (40(SP), 48(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) + // | w19 @ R12 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 + MOVQ R12, 48(SP) - // | a14 * b0 - // | (w14, w15) @ (48(SP), 56(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) + // | j10 - // | - // | b1 - MOVQ 8(SI), CX + // | w20 @ R13 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | j11 - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | w21 @ R15 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | j12 - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w22 @ R9 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j13 + + // | w23 @ CX + MULXQ 104(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j14 - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w24 @ R8 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j15 - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w25 @ BX + MULXQ 120(R10), AX, R11 + ADOXQ AX, BX - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w26 @ 40(SP) + // | move to an idle register + MOVQ 40(SP), R12 + ADCXQ R11, R12 - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w26 @ R12 + ADOXQ SI, R12 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a11 * b1 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | - // | a12 * b1 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +/* i = 2 */ - // | a13 * b1 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 
16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 R13 | 21 R15 | 22 R9 | 23 CX | 24 R8 | 25 BX | 26 R12 | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a14 * b1 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) - // | - // | b2 - MOVQ 16(SI), CX + // | u2 @ 104(SP) + MOVQ 104(SP), DX - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 +/* */ - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j9 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w20 @ R13 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 + MOVQ R13, 40(SP) - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j10 - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w21 @ R15 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j11 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w22 @ R9 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j12 - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w23 @ CX + MULXQ 96(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j13 - // | a11 * b2 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w24 @ R8 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a12 * b2 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j14 - // | a13 * b2 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w25 @ BX + MULXQ 112(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | a14 * b2 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 
88(SP) + // | j15 - // | - // | b3 - MOVQ 24(SI), CX + // | w26 @ R12 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R12 - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | w27 @ 32(SP) + // | move to an idle register + MOVQ 32(SP), R13 + ADCXQ R11, R13 + + // | w27 @ R13 + ADOXQ SI, R13 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) +/* i = 3 */ - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 R15 | 22 R9 | 23 CX | 24 R8 | 25 BX | 26 R12 | 27 R13 | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | u3 @ 96(SP) + MOVQ 96(SP), DX - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* */ - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j9 - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w21 @ R15 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 + MOVQ R15, 32(SP) - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j10 - // | a11 * b3 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w22 @ R9 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | a12 * b3 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j11 - // | a13 * b3 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w23 @ CX + MULXQ 88(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | a14 * b3 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 
96(SP) + // | j12 - // | - // | b4 - MOVQ 32(SI), CX + // | w24 @ R8 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j13 - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w25 @ BX + MULXQ 104(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j14 - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w26 @ R12 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j15 - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w27 @ R13 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R13 - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w28 @ 24(SP) + // | move to an idle register + MOVQ 24(SP), R15 + ADCXQ R11, R15 + + // | w28 @ R15 + ADOXQ SI, R15 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* i = 4 */ - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 R9 | 23 CX | 24 R8 | 25 BX | 26 R12 | 27 R13 | 28 R15 | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) - // | a11 * b4 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | u4 @ 88(SP) + MOVQ 88(SP), DX - // | a12 * b4 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a13 * b4 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* */ - // | a14 * b4 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) 
- ADCQ $0x00, 104(SP) + // | j9 - // | - // | b5 - MOVQ 40(SI), CX + // | w22 @ R9 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX + MOVQ R9, 24(SP) - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j10 - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w23 @ CX + MULXQ 80(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | j11 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w24 @ R8 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j12 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w25 @ BX + MULXQ 96(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j13 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w26 @ R12 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j14 - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w27 @ R13 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | a10 * b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j15 - // | a11 * b5 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w28 @ R15 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R15 - // | a12 * b5 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w29 @ 16(SP) + // | move to an idle register + MOVQ 16(SP), R9 + ADCXQ R11, R9 - // | a13 * b5 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w29 @ R9 + ADOXQ SI, R9 + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a14 * b5 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | - // | - // | b6 
- MOVQ 48(SI), CX +/* i = 5 */ - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 CX | 24 R8 | 25 BX | 26 R12 | 27 R13 | 28 R15 | 29 R9 | 30 8(SP) | 31 (SP) - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | u5 @ 80(SP) + MOVQ 80(SP), DX - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* */ - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j9 - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w23 @ CX + MULXQ 72(R10), AX, R11 + ADOXQ AX, CX + ADCXQ R11, R8 + MOVQ CX, 16(SP) - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j10 - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w24 @ R8 + MULXQ 80(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j11 - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w25 @ BX + MULXQ 88(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | a11 * b6 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j12 - // | a12 * b6 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w26 @ R12 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | a13 * b6 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j13 - // | a14 * b6 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w27 @ R13 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | - // | b7 - 
MOVQ 56(SI), CX + // | j14 - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w28 @ R15 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j15 - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w29 @ R9 + MULXQ 120(R10), AX, R11 + ADOXQ AX, R9 + + // | w30 @ 8(SP) + // | move to an idle register + MOVQ 8(SP), CX + ADCXQ R11, CX + + // | w30 @ CX + ADOXQ SI, CX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI + + // | - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 6 */ - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 R8 | 25 BX | 26 R12 | 27 R13 | 28 R15 | 29 R9 | 30 CX | 31 (SP) - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | u6 @ 64(SP) + MOVQ 64(SP), DX - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a8 * b7 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* */ - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j9 - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w24 @ R8 + MULXQ 72(R10), AX, R11 + ADOXQ AX, R8 + ADCXQ R11, BX - // | a11 * b7 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j10 - // | a12 * b7 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w25 @ BX + MULXQ 80(R10), AX, R11 + ADOXQ AX, BX + ADCXQ R11, R12 - // | a13 * b7 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | j11 - // | a14 * b7 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 112(DI), 
AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | w26 @ R12 + MULXQ 88(R10), AX, R11 + ADOXQ AX, R12 + ADCXQ R11, R13 - // | - // | b8 - MOVQ 64(SI), CX + // | j12 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w27 @ R13 + MULXQ 96(R10), AX, R11 + ADOXQ AX, R13 + ADCXQ R11, R15 - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j13 - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w28 @ R15 + MULXQ 104(R10), AX, R11 + ADOXQ AX, R15 + ADCXQ R11, R9 - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j14 - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w29 @ R9 + MULXQ 112(R10), AX, R11 + ADOXQ AX, R9 + ADCXQ R11, CX - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j15 - // | a6 * b8 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w30 @ CX + MULXQ 120(R10), AX, R11 + ADOXQ AX, CX - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w31 @ (SP) + // | move to an idle register + MOVQ (SP), AX + ADCXQ R11, AX - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w31 @ AX + ADOXQ SI, AX + MOVQ $0x00, SI + ADCXQ SI, SI + MOVQ $0x00, R11 + ADOXQ R11, SI - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 DI | 17 R14 | 18 56(SP) | 19 48(SP) | 20 40(SP) | 21 32(SP) | 22 24(SP) | 23 16(SP) | 24 R8 | 25 BX | 26 R12 | 27 R13 | 28 R15 | 29 R9 | 30 CX | 31 AX - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) - // | a11 * b8 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | - // | a12 * b8 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) +/* modular reduction */ - // | a13 * b8 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 104(SP) - 
ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + MOVQ DI, R11 + SUBQ (R10), R11 + MOVQ R14, DX + SBBQ 8(R10), DX + MOVQ DX, (SP) + MOVQ 56(SP), DX + SBBQ 16(R10), DX + MOVQ DX, 8(SP) + MOVQ 48(SP), DX + SBBQ 24(R10), DX + MOVQ DX, 64(SP) + MOVQ 40(SP), DX + SBBQ 32(R10), DX + MOVQ DX, 72(SP) + MOVQ 32(SP), DX + SBBQ 40(R10), DX + MOVQ DX, 80(SP) + MOVQ 24(SP), DX + SBBQ 48(R10), DX + MOVQ DX, 88(SP) + MOVQ 16(SP), DX + SBBQ 56(R10), DX + MOVQ DX, 96(SP) + MOVQ R8, DX + SBBQ 64(R10), DX + MOVQ DX, 104(SP) + MOVQ BX, DX + SBBQ 72(R10), DX + MOVQ DX, 112(SP) + MOVQ R12, DX + SBBQ 80(R10), DX + MOVQ DX, 120(SP) + MOVQ R13, DX + SBBQ 88(R10), DX + MOVQ DX, 128(SP) + MOVQ R15, DX + SBBQ 96(R10), DX + MOVQ DX, 136(SP) + MOVQ R9, DX + SBBQ 104(R10), DX + MOVQ DX, 144(SP) + MOVQ CX, DX + SBBQ 112(R10), DX + MOVQ DX, 152(SP) + MOVQ AX, DX + SBBQ 120(R10), DX + MOVQ DX, 160(SP) + SBBQ $0x00, SI - // | a14 * b8 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | - // | - // | b9 - MOVQ 72(SI), CX +/* out */ - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + MOVQ c+0(FP), SI + CMOVQCC R11, DI + MOVQ DI, (SI) + CMOVQCC (SP), R14 + MOVQ R14, 8(SI) + MOVQ 56(SP), DX + CMOVQCC 8(SP), DX + MOVQ DX, 16(SI) + MOVQ 48(SP), DX + CMOVQCC 64(SP), DX + MOVQ DX, 24(SI) + MOVQ 40(SP), DX + CMOVQCC 72(SP), DX + MOVQ DX, 32(SI) + MOVQ 32(SP), DX + CMOVQCC 80(SP), DX + MOVQ DX, 40(SI) + MOVQ 24(SP), DX + CMOVQCC 88(SP), DX + MOVQ DX, 48(SI) + MOVQ 16(SP), DX + CMOVQCC 96(SP), DX + MOVQ DX, 56(SI) + CMOVQCC 104(SP), R8 + MOVQ R8, 64(SI) + CMOVQCC 112(SP), BX + MOVQ BX, 72(SI) + CMOVQCC 120(SP), R12 + MOVQ R12, 80(SI) + CMOVQCC 128(SP), R13 + MOVQ R13, 88(SI) + CMOVQCC 136(SP), R15 + MOVQ R15, 96(SI) + CMOVQCC 144(SP), R9 + MOVQ R9, 104(SI) + CMOVQCC 152(SP), CX + MOVQ CX, 112(SI) + CMOVQCC 160(SP), AX + MOVQ AX, 120(SI) + RET - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* end */ - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) +// func mul_no_adx_bmi2_16(c *[16]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_16(SB), NOSPLIT, $312-40 + // | - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* inputs */ - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, 
R15 - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* i = 0 */ - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | a0 @ CX + MOVQ (DI), CX - // | a10 * b9 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 80(DI), AX + // | a0 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + MOVQ AX, (SP) + MOVQ DX, R8 - // | a11 * b9 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX + // | a0 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + ADDQ AX, R8 + ADCQ DX, R9 - // | a12 * b9 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 96(DI), AX + // | a0 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + ADDQ AX, R9 + ADCQ DX, R10 - // | a13 * b9 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 104(DI), AX + // | a0 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + ADDQ AX, R10 + ADCQ DX, R11 - // | a14 * b9 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 112(DI), AX + // | a0 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) - - // | - // | b10 - MOVQ 80(SI), CX + ADDQ AX, R11 + ADCQ DX, R12 - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ (DI), AX + // | a0 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + ADDQ AX, R12 + ADCQ DX, R13 - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX + // | a0 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + ADDQ AX, R13 + ADCQ DX, R14 - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX + // | a0 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R14 + ADCQ DX, R15 - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* i = 1 */ - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX + // | a1 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 
8(SP) + MOVQ $0x00, R8 - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX + // | a1 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX + // | a1 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a9 * b10 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 72(DI), AX + // | a1 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a10 * b10 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX + // | a1 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a11 * b10 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 88(DI), AX + // | a1 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a12 * b10 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 96(DI), AX + // | a1 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | a13 * b10 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 104(DI), AX + // | a1 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + ADDQ AX, R15 + ADCQ DX, R8 - // | a14 * b10 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | - // | - // | b11 - MOVQ 88(SI), CX +/* i = 2 */ - // | a0 * b11 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX - // | a1 * b11 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 8(DI), AX + // | a2 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 16(SP) + MOVQ $0x00, R9 - // | a2 * b11 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 16(DI), AX + // | a2 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b11 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 24(DI), AX + // | a2 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b11 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 32(DI), AX + // | a2 * b3 + MOVQ 24(SI), 
AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b11 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 40(DI), AX + // | a2 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b11 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 48(DI), AX + // | a2 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b11 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 56(DI), AX + // | a2 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | a8 * b11 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 64(DI), AX + // | a2 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + ADDQ AX, R8 + ADCQ DX, R9 - // | a9 * b11 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | - // | a10 * b11 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) +/* i = 3 */ - // | a11 * b11 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX - // | a12 * b11 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 96(DI), AX + // | a3 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 24(SP) + MOVQ $0x00, R10 - // | a13 * b11 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 104(DI), AX + // | a3 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a14 * b11 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 112(DI), AX + // | a3 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) - - // | - // | b12 - MOVQ 96(SI), CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a0 * b12 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ (DI), AX + // | a3 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b12 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 8(DI), AX + // | a3 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b12 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - 
MOVQ 16(DI), AX + // | a3 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b12 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 24(DI), AX + // | a3 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | a4 * b12 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 32(DI), AX + // | a3 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R9 + ADCQ DX, R10 - // | a5 * b12 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | - // | a6 * b12 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) +/* i = 4 */ - // | a7 * b12 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX - // | a8 * b12 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 64(DI), AX + // | a4 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 32(SP) + MOVQ $0x00, R11 - // | a9 * b12 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 72(DI), AX + // | a4 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a10 * b12 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 80(DI), AX + // | a4 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a11 * b12 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 88(DI), AX + // | a4 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a12 * b12 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 96(DI), AX + // | a4 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a13 * b12 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 104(DI), AX + // | a4 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a14 * b12 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 112(DI), AX + // | a4 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) - - // | - // | b13 - MOVQ 104(SI), CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | a0 * b13 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 
56(SP), 64(SP)) - MOVQ (DI), AX + // | a4 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + ADDQ AX, R10 + ADCQ DX, R11 - // | a1 * b13 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a2 * b13 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* i = 5 */ - // | a3 * b13 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX - // | a4 * b13 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 32(DI), AX + // | a5 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 40(SP) + MOVQ $0x00, R12 - // | a5 * b13 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 40(DI), AX + // | a5 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a6 * b13 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 48(DI), AX + // | a5 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a7 * b13 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 56(DI), AX + // | a5 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a8 * b13 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 64(DI), AX + // | a5 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a9 * b13 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 72(DI), AX + // | a5 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a10 * b13 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 80(DI), AX + // | a5 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 - // | a11 * b13 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 88(DI), AX + // | a5 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + ADDQ AX, R11 + ADCQ DX, R12 - // | a12 * b13 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | - // | a13 * b13 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) +/* i = 6 */ - 
// | a14 * b13 - // | (w27, w28, w29) @ (152(SP), 160(SP), 168(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX - // | - // | b14 - MOVQ 112(SI), CX + // | a6 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 48(SP) + MOVQ $0x00, R13 - // | a0 * b14 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ (DI), AX + // | a6 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a1 * b14 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 8(DI), AX + // | a6 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a2 * b14 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 16(DI), AX + // | a6 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a3 * b14 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 24(DI), AX + // | a6 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a4 * b14 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 32(DI), AX + // | a6 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a5 * b14 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 40(DI), AX + // | a6 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | a6 * b14 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 48(DI), AX + // | a6 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + ADDQ AX, R12 + ADCQ DX, R13 - // | a7 * b14 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 56(DI), AX + // | + +/* i = 7 */ + + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX + + // | a7 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 56(SP) + MOVQ $0x00, R14 - // | a8 * b14 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 64(DI), AX + // | a7 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a9 * b14 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 72(DI), AX + // | a7 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a10 * b14 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 80(DI), AX + // | a7 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - 
ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a11 * b14 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 88(DI), AX + // | a7 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a12 * b14 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 96(DI), AX + // | a7 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | a13 * b14 - // | (w27, w28, w29) @ (152(SP), 160(SP), 168(SP)) - MOVQ 104(DI), AX + // | a7 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 - // | a14 * b14 - // | (w28, w29) @ (160(SP), 168(SP)) - MOVQ 112(DI), AX + // | a7 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, 168(SP) + ADDQ AX, R13 + ADCQ DX, R14 - // | - // | Montgomerry Reduction - MOVQ R15, 176(SP) - MOVQ R14, 184(SP) - MOVQ p+24(FP), R14 + // | - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 8 */ - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b0 + MOVQ (SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 64(SP) + MOVQ $0x00, R15 + + // | a8 * b1 + MOVQ 8(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX + // | a8 * b2 + MOVQ 16(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX + // | a8 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX + // | a8 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX + // | a8 * b5 + MOVQ 40(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX + // | a8 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 - // | w6 @ 184(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a8 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 - // | w7 @ 176(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | + +/* i = 9 */ + + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX + + // | a9 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 72(SP) + MOVQ $0x00, R8 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a9 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), 
AX + // | a9 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX + // | a9 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 88(R14), AX + // | a9 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a9 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a9 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a9 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 - // | w15 @ 56(SP) - ADDQ DI, 56(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 64(SP), R8 +/* i = 10 */ - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX + // | a10 * b0 + MOVQ (SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 80(SP) + MOVQ $0x00, R9 - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX + // | a10 * b1 + MOVQ 8(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX + // | a10 * b2 + MOVQ 16(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX + // | a10 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX + // | a10 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 184(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a10 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 176(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a10 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a10 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | + +/* i = 11 */ + + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + 
MOVQ R10, 88(SP) + MOVQ $0x00, R10 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a11 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 80(R14), AX + // | a11 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 88(R14), AX + // | a11 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a11 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a11 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a11 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | w16 @ R8 - ADDQ DI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a11 * b7 + MOVQ 56(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 - // | - MOVQ 72(SP), R9 + // | - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 12 */ - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX + // | a12 * b0 + MOVQ (SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 96(SP) + MOVQ $0x00, R11 - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX + // | a12 * b1 + MOVQ 8(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX + // | a12 * b2 + MOVQ 16(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 184(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a12 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 176(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a12 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a12 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a12 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a12 * b7 + MOVQ 56(SI), AX MULQ CX - 
ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 80(R14), AX +/* i = 13 */ + + // | a13 @ CX + MOVQ 104(DI), CX + MOVQ $0x00, BX + + // | a13 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 104(SP) + MOVQ $0x00, R12 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 88(R14), AX + // | a13 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a13 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a13 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ DI, DI - MOVQ 112(R14), AX + // | a13 * b4 + MOVQ 32(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI - - // | w17 @ R9 - ADDQ DI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ 80(SP), R10 + // | a13 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a13 * b6 + MOVQ 48(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX + // | a13 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI + ADCQ DX, R12 - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX +/* i = 14 */ + + // | a14 @ CX + MOVQ 112(DI), CX + MOVQ $0x00, BX + + // | a14 * b0 + MOVQ (SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 112(SP) + MOVQ $0x00, R13 - // | w6 @ 184(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a14 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 176(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a14 * b2 + MOVQ 16(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a14 * b3 + MOVQ 24(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a14 * b4 + MOVQ 32(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a14 * b5 + MOVQ 40(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI 
- ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a14 * b6 + MOVQ 48(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a14 * b7 + MOVQ 56(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 88(R14), AX +/* i = 15 */ + + // | a15 @ CX + MOVQ 120(DI), CX + MOVQ $0x00, BX + + // | a15 * b0 + MOVQ (SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a15 * b1 + MOVQ 8(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ SI, SI - MOVQ 104(R14), AX + // | a15 * b2 + MOVQ 16(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ DI, DI - MOVQ 112(R14), AX + // | a15 * b3 + MOVQ 24(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI - - // | w18 @ R10 - ADDQ DI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ 88(SP), R11 + // | a15 * b4 + MOVQ 32(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a15 * b5 + MOVQ 40(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX + // | a15 * b6 + MOVQ 48(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI + ADCQ DX, R13 + ADCQ $0x00, BX - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX + // | a15 * b7 + MOVQ 56(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, BX - // | w6 @ 184(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + // | - // | w7 @ 176(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI +/* */ - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | + // | W part 1 multiplication + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 R14 + // | 16 R15 | 17 R8 | 18 R9 | 19 R10 | 20 R11 | 21 R12 | 22 R13 | 23 BX | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - + + + MOVQ R14, 120(SP) + MOVQ R15, 128(SP) + MOVQ R8, 136(SP) + MOVQ R9, 144(SP) + MOVQ R10, 152(SP) + MOVQ R11, 160(SP) + MOVQ R12, 168(SP) + MOVQ R13, 176(SP) + MOVQ BX, 184(SP) + + // | + // | W part 1 moved to stack + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) 
| 23 184(SP) | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - + + + MOVQ $0x00, R9 + MOVQ $0x00, R10 + MOVQ $0x00, R11 + MOVQ $0x00, R12 + MOVQ $0x00, R13 + MOVQ $0x00, R14 + MOVQ $0x00, R15 + + // | + +/* i = 0 */ + + // | a0 @ CX + MOVQ (DI), CX + + // | a0 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + MOVQ AX, 192(SP) + MOVQ DX, R8 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a0 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a0 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a0 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a0 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, R12 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a0 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 80(R14), AX + // | a0 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 88(R14), AX + // | a0 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 - // | w16 @ R8 - XORQ DI, DI - MOVQ 96(R14), AX + // | + +/* i = 1 */ + + // | a1 @ CX + MOVQ 8(DI), CX + MOVQ $0x00, BX + + // | a1 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 200(SP) + MOVQ $0x00, R8 - // | w17 @ R9 - XORQ SI, SI - MOVQ 104(R14), AX + // | a1 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w18 @ R10 - XORQ DI, DI - MOVQ 112(R14), AX + // | a1 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI - - // | w19 @ R11 - ADDQ DI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | - MOVQ 96(SP), R12 + // | a1 * b11 + MOVQ 88(SI), AX + MULQ CX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a1 * b12 + MOVQ 96(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX + // | a1 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 184(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a1 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | w7 @ 176(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a1 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX 
- MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX +/* i = 2 */ + + // | a2 @ CX + MOVQ 16(DI), CX + MOVQ $0x00, BX + + // | a2 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 208(SP) + MOVQ $0x00, R9 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a2 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a2 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a2 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a2 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a2 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 80(R14), AX + // | a2 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | w16 @ R8 - XORQ SI, SI - MOVQ 88(R14), AX + // | a2 * b15 + MOVQ 120(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 - // | w17 @ R9 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | - // | w18 @ R10 - XORQ SI, SI - MOVQ 104(R14), AX +/* i = 3 */ + + // | a3 @ CX + MOVQ 24(DI), CX + MOVQ $0x00, BX + + // | a3 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 216(SP) + MOVQ $0x00, R10 - // | w19 @ R11 - XORQ DI, DI - MOVQ 112(R14), AX + // | a3 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI - - // | w20 @ R12 - ADDQ DI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - - // | - MOVQ 104(SP), R13 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w6 @ 184(SP)) * inp - MOVQ 184(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a3 * b10 + MOVQ 80(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w6 @ 184(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a3 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 176(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a3 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a3 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - 
ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a3 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a3 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + ADDQ AX, R9 + ADCQ DX, R10 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | + +/* i = 4 */ + + // | a4 @ CX + MOVQ 32(DI), CX + MOVQ $0x00, BX + + // | a4 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 224(SP) + MOVQ $0x00, R11 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a4 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a4 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a4 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 72(R14), AX + // | a4 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX + // | a4 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ SI, SI - MOVQ 88(R14), AX + // | a4 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 - // | w18 @ R10 - XORQ DI, DI - MOVQ 96(R14), AX + // | a4 * b15 + MOVQ 120(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 - // | w19 @ R11 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | - // | w20 @ R12 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI +/* i = 5 */ - // | w21 @ R13 - ADDQ DI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a5 @ CX + MOVQ 40(DI), CX + MOVQ $0x00, BX - // | - MOVQ 112(SP), BX - MOVQ BX, 184(SP) + // | a5 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 232(SP) + MOVQ $0x00, R12 - // | (u @ CX) = (w7 @ 176(SP)) * inp - MOVQ 176(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a5 * b9 + MOVQ 72(SI), AX + MULQ CX + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w7 @ 176(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a5 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a5 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ 
BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a5 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a5 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a5 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a5 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | + +/* i = 6 */ + + // | a6 @ CX + MOVQ 48(DI), CX + MOVQ $0x00, BX + + // | a6 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 240(SP) + MOVQ $0x00, R13 - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a6 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 64(R14), AX + // | a6 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX + // | a6 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX + // | a6 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w18 @ R10 - XORQ SI, SI - MOVQ 88(R14), AX + // | a6 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w19 @ R11 - XORQ DI, DI - MOVQ 96(R14), AX + // | a6 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI - - // | w20 @ R12 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 - // | w21 @ R13 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | a6 * b15 + MOVQ 120(SI), AX + MULQ CX + ADDQ AX, R12 + ADCQ DX, R13 - // | w22 @ 184(SP) - ADDQ DI, R15 - ADCQ R15, 184(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 120(SP), BX - MOVQ BX, 176(SP) +/* i = 7 */ - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a7 @ CX + MOVQ 56(DI), CX + MOVQ $0x00, BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX + // | a7 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX + MOVQ R14, 248(SP) + MOVQ $0x00, R14 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a7 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ 
$0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a7 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a7 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a7 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a7 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a7 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 56(R14), AX + // | a7 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 - // | w16 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX + // | + +/* i = 8 */ + + // | a8 @ CX + MOVQ 64(DI), CX + MOVQ $0x00, BX + + // | a8 * b8 + MOVQ 64(SI), AX + MULQ CX + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ $0x00, R9 + ADCQ $0x00, BX + MOVQ R15, 256(SP) + MOVQ $0x00, R15 + + // | a8 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX + // | a8 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w18 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX + // | a8 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w19 @ R11 - XORQ SI, SI - MOVQ 88(R14), AX + // | a8 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w20 @ R12 - XORQ DI, DI - MOVQ 96(R14), AX + // | a8 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w21 @ R13 - XORQ SI, SI - MOVQ 104(R14), AX + // | a8 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 - // | w22 @ 184(SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a8 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 - // | w23 @ 176(SP) - ADDQ DI, R15 - ADCQ R15, 176(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 128(SP), BX - MOVQ BX, (SP) +/* i = 9 */ - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a9 @ CX + MOVQ 72(DI), CX + MOVQ $0x00, BX - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a9 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI + ADDQ 
AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + ADCQ $0x00, BX + MOVQ R8, 264(SP) + MOVQ $0x00, R8 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a9 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a9 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a9 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a9 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a9 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 48(R14), AX + // | a9 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 - // | w16 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX + // | a9 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 - // | w17 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX + // | + +/* i = 10 */ + + // | a10 @ CX + MOVQ 80(DI), CX + MOVQ $0x00, BX + + // | a10 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADCQ DX, R10 + ADCQ $0x00, R11 + ADCQ $0x00, BX + MOVQ R9, 272(SP) + MOVQ $0x00, R9 - // | w18 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX + // | a10 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w19 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX + // | a10 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w20 @ R12 - XORQ SI, SI - MOVQ 88(R14), AX + // | a10 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w21 @ R13 - XORQ DI, DI - MOVQ 96(R14), AX + // | a10 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w22 @ 184(SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a10 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w23 @ 176(SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a10 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 - // | w24 @ (SP) - ADDQ DI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a10 * b15 + MOVQ 120(SI), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 - // | - MOVQ 136(SP), BX - MOVQ BX, 8(SP) + // | - // | (u @ CX) 
= (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 11 */ - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a11 @ CX + MOVQ 88(DI), CX + MOVQ $0x00, BX + + // | a11 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ $0x00, R12 + ADCQ $0x00, BX + MOVQ R10, 280(SP) + MOVQ $0x00, R10 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a11 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a11 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a11 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a11 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 40(R14), AX + // | a11 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ DI, DI - MOVQ 48(R14), AX + // | a11 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ BX, R10 - // | w17 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX + // | a11 * b15 + MOVQ 120(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 - // | w18 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | - // | w19 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX +/* i = 12 */ + + // | a12 @ CX + MOVQ 96(DI), CX + MOVQ $0x00, BX + + // | a12 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + MOVQ R11, 288(SP) + MOVQ $0x00, R11 - // | w20 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX + // | a12 * b9 + MOVQ 72(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ BX, R14 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w21 @ R13 - XORQ SI, SI - MOVQ 88(R14), AX + // | a12 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w22 @ 184(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a12 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w23 @ 176(SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a12 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w24 @ (SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a12 * b13 + MOVQ 104(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX + + // | a12 * b14 + MOVQ 112(SI), AX + MULQ 
CX + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 - // | w25 @ 8(SP) - ADDQ DI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | a12 * b15 + MOVQ 120(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 - // | - MOVQ 144(SP), BX - MOVQ BX, 16(SP) + // | - // | (u @ CX) = (w11 @ 24(SP)) * inp - MOVQ 24(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* i = 13 */ - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI + // | a13 @ CX + MOVQ 104(DI), CX + MOVQ $0x00, BX - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a13 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R14 + ADCQ $0x00, BX + MOVQ R12, 296(SP) + MOVQ $0x00, R12 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | a13 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + ADDQ AX, R13 + ADCQ DX, R14 + ADCQ BX, R15 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a13 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 32(R14), AX + // | a13 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX + // | a13 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX + // | a13 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w18 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX + // | a13 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ DX, R11 + ADCQ BX, R12 - // | w19 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX + // | a13 * b15 + MOVQ 120(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R12 - // | w20 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | - // | w21 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX +/* i = 14 */ + + // | a14 @ CX + MOVQ 112(DI), CX + MOVQ $0x00, BX + + // | a14 * b8 + MOVQ 64(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ DX, R14 + ADCQ $0x00, R15 + ADCQ $0x00, BX + MOVQ R13, 304(SP) + MOVQ $0x00, R13 - // | w22 @ 184(SP) - XORQ SI, SI - MOVQ 88(R14), AX + // | a14 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ BX, R8 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w23 @ 176(SP) - XORQ DI, DI - MOVQ 96(R14), AX + // | a14 * b10 + MOVQ 80(SI), AX MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w24 @ (SP) - XORQ SI, SI - MOVQ 104(R14), AX + // | a14 * b11 + MOVQ 88(SI), AX MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w25 @ 8(SP) - XORQ DI, DI - MOVQ 112(R14), AX + // | a14 * b12 + MOVQ 96(SI), AX MULQ CX - ADDQ AX, 
8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI - - // | w26 @ 16(SP) - ADDQ DI, R15 - ADCQ R15, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - - // | - MOVQ 152(SP), BX - MOVQ BX, 24(SP) + ADDQ AX, R9 + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | (u @ CX) = (w12 @ 32(SP)) * inp - MOVQ 32(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | a14 * b13 + MOVQ 104(SI), AX + MULQ CX + ADDQ AX, R10 + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ (R14), AX + // | a14 * b14 + MOVQ 112(SI), AX MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ BX, R13 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 8(R14), AX + // | a14 * b15 + MOVQ 120(SI), AX MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + ADDQ AX, R12 + ADCQ DX, R13 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 16(R14), AX + // | + +/* i = 15 */ + + // | a15 @ CX + MOVQ 120(DI), CX + MOVQ $0x00, BX + + // | a15 * b8 + MOVQ 64(SI), AX MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + ADDQ AX, R14 + ADCQ DX, R15 + ADCQ $0x00, R8 + ADCQ $0x00, BX - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 24(R14), AX + // | a15 * b9 + MOVQ 72(SI), AX MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + ADDQ AX, R15 + ADCQ DX, R8 + ADCQ BX, R9 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w16 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX + // | a15 * b10 + MOVQ 80(SI), AX MULQ CX ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + ADCQ DX, R9 + ADCQ BX, R10 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w17 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX + // | a15 * b11 + MOVQ 88(SI), AX MULQ CX ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R10 + ADCQ BX, R11 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w18 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX + // | a15 * b12 + MOVQ 96(SI), AX MULQ CX ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R11 + ADCQ BX, R12 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w19 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX + // | a15 * b13 + MOVQ 104(SI), AX MULQ CX ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R12 + ADCQ BX, R13 + MOVQ $0x00, BX + ADCQ $0x00, BX - // | w20 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX + // | a15 * b14 + MOVQ 112(SI), AX MULQ CX ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ DX, R13 + ADCQ $0x00, BX - // | w21 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX + // | a15 * b15 + MOVQ 120(SI), AX MULQ CX ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ DX, BX - // | w22 @ 184(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + // | - // | w23 @ 176(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI +/* */ - // | w24 @ (SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | + // | W part 2 multiplication + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 192(SP) | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 R10 | 28 R11 | 29 R12 | 30 R13 | 31 BX + + + // | + // | W part 1 + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 64(SP) | 9 72(SP) | 10 80(SP) | 11 88(SP) | 
12 96(SP) | 13 104(SP) | 14 112(SP) | 15 120(SP) + // | 16 128(SP) | 17 136(SP) | 18 144(SP) | 19 152(SP) | 20 160(SP) | 21 168(SP) | 22 176(SP) | 23 184(SP) | 24 - | 25 - | 26 - | 27 - | 28 - | 29 - | 30 - | 31 - + + + MOVQ 64(SP), AX + ADDQ AX, 192(SP) + MOVQ 72(SP), AX + ADCQ AX, 200(SP) + MOVQ 80(SP), AX + ADCQ AX, 208(SP) + MOVQ 88(SP), AX + ADCQ AX, 216(SP) + MOVQ 96(SP), AX + ADCQ AX, 224(SP) + MOVQ 104(SP), AX + ADCQ AX, 232(SP) + MOVQ 112(SP), AX + ADCQ AX, 240(SP) + MOVQ 120(SP), AX + ADCQ AX, 248(SP) + MOVQ 128(SP), AX + ADCQ AX, 256(SP) + MOVQ 136(SP), AX + ADCQ AX, 264(SP) + MOVQ 144(SP), AX + ADCQ AX, 272(SP) + MOVQ 152(SP), AX + ADCQ AX, 280(SP) + MOVQ 160(SP), AX + ADCQ AX, 288(SP) + MOVQ 168(SP), AX + ADCQ AX, 296(SP) + MOVQ 176(SP), AX + ADCQ AX, 304(SP) + ADCQ 184(SP), R14 + ADCQ $0x00, R15 + ADCQ $0x00, R8 + ADCQ $0x00, R9 + ADCQ $0x00, R10 + ADCQ $0x00, R11 + ADCQ $0x00, R12 + ADCQ $0x00, R13 + ADCQ $0x00, BX + + // | + // | W combined + // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 24(SP) | 4 32(SP) | 5 40(SP) | 6 48(SP) | 7 56(SP) | 8 192(SP) | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 R14 | 24 R15 | 25 R8 | 26 R9 | 27 R10 | 28 R11 | 29 R12 | 30 R13 | 31 BX + + + MOVQ (SP), CX + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ BX, (SP) + MOVQ 24(SP), BX + MOVQ R13, 8(SP) + MOVQ 32(SP), R13 + MOVQ R12, 16(SP) + MOVQ 40(SP), R12 + MOVQ R11, 24(SP) + MOVQ 48(SP), R11 + MOVQ R10, 32(SP) + MOVQ 56(SP), R10 + MOVQ R9, 40(SP) + MOVQ 192(SP), R9 + MOVQ R8, 48(SP) + MOVQ R15, 56(SP) + MOVQ R14, 64(SP) + + // | fetch modulus + MOVQ p+24(FP), R15 + + // | + +/* montgomery reduction q1 */ + + // | + +/* i = 0 */ + + // | + // | W + // | 0 CX | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) + + + // | | u0 = w0 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u0 + MOVQ R14, 72(SP) + + // | j0 + + // | w0 @ CX + MOVQ (R15), AX + MULQ R14 + ADDQ AX, CX + ADCQ DX, R8 + + // | j1 + + // | w1 @ DI + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w2 @ SI + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w3 @ BX + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w4 @ R13 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w5 @ R12 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w6 @ R11 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w7 @ R10 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + + // | w8 @ R9 + ADCQ DX, R9 + ADCQ $0x00, CX + + // | + +/* i = 1 */ + + // | + // | W + // | 0 - | 1 DI | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 200(SP) | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) 
| 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) + + + // | | u1 = w1 * inp + MOVQ DI, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 + + // | + +/* */ + + // | save u1 + MOVQ R14, 80(SP) + + // | j0 + + // | w1 @ DI + MOVQ (R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ DX, R8 + + // | j1 + + // | w2 @ SI + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w3 @ BX + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w4 @ R13 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w5 @ R12 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w6 @ R11 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w7 @ R10 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w8 @ R9 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ DX, CX + ADDQ R8, R9 - // | w25 @ 8(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | move to idle register + MOVQ 200(SP), DI - // | w26 @ 16(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w9 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w27 @ 24(SP) - ADDQ DI, R15 - ADCQ R15, 24(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 160(SP), BX - MOVQ BX, 32(SP) +/* i = 2 */ + + // | + // | W + // | 0 - | 1 - | 2 SI | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 208(SP) | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | (u @ CX) = (w13 @ 40(SP)) * inp - MOVQ 40(SP), AX + + // | | u2 = w2 * inp + MOVQ SI, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, R14 + MOVQ $0x00, R8 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI + // | - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI +/* */ - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | save u2 + MOVQ R14, 88(SP) - // | w16 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | j0 - // | w17 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w2 @ SI + MOVQ (R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ DX, R8 - // | w18 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j1 - // | w19 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | w3 @ BX + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R12 - XORQ SI, 
SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j2 - // | w21 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX + // | w4 @ R13 + MOVQ 16(R15), AX + MULQ R14 ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ 184(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, SI - ADDQ DI, 184(SP) - ADCQ $0x00, SI + // | j3 - // | w23 @ 176(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, DI - ADDQ SI, 176(SP) - ADCQ $0x00, DI + // | w5 @ R12 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w24 @ (SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j4 - // | w25 @ 8(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w6 @ R11 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w26 @ 16(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | j5 - // | w27 @ 24(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w7 @ R10 + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w28 @ 32(SP) - ADDQ DI, R15 - ADCQ R15, 32(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j6 - // | - MOVQ 168(SP), BX - MOVQ BX, 40(SP) + // | w8 @ R9 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | (u @ CX) = (w14 @ 48(SP)) * inp - MOVQ 48(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j7 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI + // | w9 @ DI + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + // | move to idle register + MOVQ 208(SP), SI - // | w16 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | w10 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | w17 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | - // | w18 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI +/* i = 3 */ - // | w19 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 BX | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 216(SP) | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w20 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI - // | w21 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | | u3 = w3 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 - // | w22 @ 184(SP) - XORQ DI, DI - MOVQ 64(R14), 
AX - MULQ CX - ADDQ AX, 184(SP) - ADCQ DX, DI - ADDQ SI, 184(SP) - ADCQ $0x00, DI + // | - // | w23 @ 176(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, SI - ADDQ DI, 176(SP) - ADCQ $0x00, SI +/* */ - // | w24 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | save u3 + MOVQ R14, 96(SP) - // | w25 @ 8(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j0 - // | w26 @ 16(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w3 @ BX + MOVQ (R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ DX, R8 - // | w27 @ 24(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j1 - // | w28 @ 32(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w4 @ R13 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w29 @ 40(SP) - ADDQ DI, R15 - ADCQ R15, 40(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j2 - // | Reduce by modulus - MOVQ 56(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 48(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 192(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 200(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 208(SP) - MOVQ 184(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 216(SP) - MOVQ 176(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 224(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 232(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 240(SP) - MOVQ 16(SP), BX - SBBQ 88(R14), BX - MOVQ BX, 248(SP) - MOVQ 24(SP), BX - SBBQ 96(R14), BX - MOVQ BX, 256(SP) - MOVQ 32(SP), BX - SBBQ 104(R14), BX - MOVQ BX, 264(SP) - MOVQ 40(SP), BX - SBBQ 112(R14), BX - MOVQ BX, 272(SP) - SBBQ $0x00, R15 + // | w5 @ R12 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 56(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 48(SP), R10 - MOVQ R10, 24(DI) - CMOVQCC 192(SP), R11 - MOVQ R11, 32(DI) - CMOVQCC 200(SP), R12 - MOVQ R12, 40(DI) - CMOVQCC 208(SP), R13 - MOVQ R13, 48(DI) - MOVQ 184(SP), BX - CMOVQCC 216(SP), BX - MOVQ BX, 56(DI) - MOVQ 176(SP), BX - CMOVQCC 224(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 232(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 240(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - CMOVQCC 248(SP), BX - MOVQ BX, 88(DI) - MOVQ 24(SP), BX - CMOVQCC 256(SP), BX - MOVQ BX, 96(DI) - MOVQ 32(SP), BX - CMOVQCC 264(SP), BX - MOVQ BX, 104(DI) - MOVQ 40(SP), BX - CMOVQCC 272(SP), BX - MOVQ BX, 112(DI) - RET + // | j3 + // | w6 @ R11 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 -// func mul12(c *[32]uint64, a *[16]uint64, b *[16]uint64, p *[16]uint64, inp uint64) -TEXT ·mul16(SB), $304-40 -/* inputs */ - // | - // | Multiplication - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI + // | j4 - // | - // | - XORQ R10, R10 - XORQ R11, R11 - XORQ R12, R12 - XORQ R13, R13 - XORQ R14, R14 - XORQ R15, R15 - MOVQ $0x00000000, (SP) - MOVQ $0x00000000, 8(SP) - MOVQ $0x00000000, 16(SP) - MOVQ $0x00000000, 24(SP) - MOVQ $0x00000000, 32(SP) - MOVQ $0x00000000, 
40(SP) - MOVQ $0x00000000, 48(SP) - MOVQ $0x00000000, 56(SP) - MOVQ $0x00000000, 64(SP) - MOVQ $0x00000000, 72(SP) - MOVQ $0x00000000, 80(SP) - MOVQ $0x00000000, 88(SP) - MOVQ $0x00000000, 96(SP) - MOVQ $0x00000000, 104(SP) - MOVQ $0x00000000, 112(SP) - MOVQ $0x00000000, 120(SP) - MOVQ $0x00000000, 128(SP) - MOVQ $0x00000000, 136(SP) - MOVQ $0x00000000, 144(SP) - MOVQ $0x00000000, 152(SP) - MOVQ $0x00000000, 160(SP) - MOVQ $0x00000000, 168(SP) - MOVQ $0x00000000, 176(SP) - MOVQ $0x00000000, 184(SP) - - // | - // | b0 - MOVQ (SI), CX + // | w7 @ R10 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a0 * b0 - // | (w0, w1) @ (R8, R9) - MOVQ (DI), AX - MULQ CX - MOVQ AX, R8 - MOVQ DX, R9 + // | j5 - // | a1 * b0 - // | (w1, w2) @ (R9, R10) - MOVQ 8(DI), AX - MULQ CX + // | w8 @ R9 + MOVQ 40(R15), AX + MULQ R14 ADDQ AX, R9 - ADCQ DX, R10 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w9 @ DI + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w10 @ SI + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI - // | a2 * b0 - // | (w2, w3) @ (R10, R11) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 + // | move to idle register + MOVQ 216(SP), BX - // | a3 * b0 - // | (w3, w4) @ (R11, R12) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 + // | w11 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a4 * b0 - // | (w4, w5) @ (R12, R13) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 + // | - // | a5 * b0 - // | (w5, w6) @ (R13, R14) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 +/* i = 4 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 R13 | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX | 12 224(SP) | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a6 * b0 - // | (w6, w7) @ (R14, R15) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - // | a7 * b0 - // | (w7, w8) @ (R15, (SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) + // | | u4 = w4 * inp + MOVQ R13, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 - // | a8 * b0 - // | (w8, w9) @ ((SP), 8(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) + // | - // | a9 * b0 - // | (w9, w10) @ (8(SP), 16(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) +/* */ - // | a10 * b0 - // | (w10, w11) @ (16(SP), 24(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) + // | save u4 + MOVQ R14, 104(SP) - // | a11 * b0 - // | (w11, w12) @ (24(SP), 32(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) + // | j0 - // | a12 * b0 - // | (w12, w13) @ (32(SP), 40(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) + // | w4 @ R13 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ DX, R8 - // | a13 * b0 - // | (w13, w14) @ (40(SP), 48(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) + // | j1 - // | a14 * b0 - // | (w14, w15) @ (48(SP), 56(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) + // | w5 @ R12 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a15 * b0 - // | (w15, w16) @ (56(SP), 64(SP)) - MOVQ 120(DI), AX - 
MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) + // | j2 - // | - // | b1 - MOVQ 8(SI), CX + // | w6 @ R11 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a0 * b1 - // | (w1, w2, w3, w4) @ (R9, R10, R11, R12) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, R10 - ADCQ $0x00, R11 - ADCQ $0x00, R12 + // | j3 - // | a1 * b1 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ 8(DI), AX - MULQ CX + // | w7 @ R10 + MOVQ 24(R15), AX + MULQ R14 ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a2 * b1 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + // | j4 - // | a3 * b1 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | w8 @ R9 + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w9 @ DI + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w10 @ SI + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a4 * b1 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | j7 - // | a5 * b1 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | w11 @ BX + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX - // | a6 * b1 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | move to idle register + MOVQ 224(SP), R13 - // | a7 * b1 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w12 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a8 * b1 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | - // | a9 * b1 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) +/* i = 5 */ - // | a10 * b1 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12 | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 232(SP) | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a11 * b1 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) - // | a12 * b1 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | | u5 = w5 * inp + MOVQ 
R12, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 - // | a13 * b1 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | - // | a14 * b1 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) +/* */ - // | a15 * b1 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | save u5 + MOVQ R14, 112(SP) - // | - // | b2 - MOVQ 16(SI), CX + // | j0 - // | a0 * b2 - // | (w2, w3, w4, w5) @ (R10, R11, R12, R13) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, R11 - ADCQ $0x00, R12 - ADCQ $0x00, R13 + // | w5 @ R12 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ DX, R8 - // | a1 * b2 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ 8(DI), AX - MULQ CX + // | j1 + + // | w6 @ R11 + MOVQ 8(R15), AX + MULQ R14 ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a2 * b2 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j2 - // | a3 * b2 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + // | w7 @ R10 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a4 * b2 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j3 - // | a5 * b2 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w8 @ R9 + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w9 @ DI + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w10 @ SI + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w11 @ BX + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w12 @ R13 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 - // | a6 * b2 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | move to idle register + MOVQ 232(SP), R12 - // | a7 * b2 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w13 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a8 * b2 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a9 * b2 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 6 */ - // | a10 * b2 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 
48(SP), 56(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 R11 | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 240(SP) | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a11 * b2 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) - // | a12 * b2 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | | u6 = w6 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 - // | a13 * b2 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a14 * b2 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* */ - // | a15 * b2 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | save u6 + MOVQ R14, 120(SP) - // | - // | b3 - MOVQ 24(SI), CX + // | j0 - // | a0 * b3 - // | (w3, w4, w5, w6) @ (R11, R12, R13, R14) - MOVQ (DI), AX - MULQ CX + // | w6 @ R11 + MOVQ (R15), AX + MULQ R14 ADDQ AX, R11 - ADCQ DX, R12 - ADCQ $0x00, R13 - ADCQ $0x00, R14 + ADCQ DX, R8 - // | a1 * b3 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j1 - // | a2 * b3 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 16(DI), AX - MULQ CX + // | w7 @ R10 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w8 @ R9 + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w9 @ DI + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w10 @ SI + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w11 @ BX + MOVQ 40(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w12 @ R13 + MOVQ 48(R15), AX + MULQ R14 ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a3 * b3 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j7 - // | a4 * b3 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w13 @ R12 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 - // | a5 * b3 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | move to idle register + MOVQ 240(SP), 
R11 - // | a6 * b3 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w14 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a7 * b3 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | - // | a8 * b3 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) +/* i = 7 */ - // | a9 * b3 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 R10 | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 248(SP) + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a10 * b3 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) - // | a11 * b3 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | | u7 = w7 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, R14 + MOVQ $0x00, R8 - // | a12 * b3 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a13 * b3 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* */ - // | a14 * b3 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | save u7 + MOVQ R14, 128(SP) - // | a15 * b3 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j0 - // | - // | b4 - MOVQ 32(SI), CX + // | w7 @ R10 + MOVQ (R15), AX + MULQ R14 + ADDQ AX, R10 + ADCQ DX, R8 - // | a0 * b4 - // | (w4, w5, w6, w7) @ (R12, R13, R14, R15) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, R13 - ADCQ $0x00, R14 - ADCQ $0x00, R15 + // | j1 - // | a1 * b4 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ 8(DI), AX - MULQ CX + // | w8 @ R9 + MOVQ 8(R15), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + ADDQ R8, R9 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w9 @ DI + MOVQ 16(R15), AX + MULQ R14 + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w10 @ SI + MOVQ 24(R15), AX + MULQ R14 + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w11 @ BX + MOVQ 32(R15), AX + MULQ R14 + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w12 @ R13 + MOVQ 40(R15), AX + MULQ R14 ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a2 * b4 - // | (w6, w7, w8, w9) @ (R14, 
R15, (SP), 8(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | j6 - // | a3 * b4 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w13 @ R12 + MOVQ 48(R15), AX + MULQ R14 + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a4 * b4 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j7 - // | a5 * b4 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w14 @ R11 + MOVQ 56(R15), AX + MULQ R14 + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 - // | a6 * b4 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | move to idle register + MOVQ 248(SP), R10 - // | a7 * b4 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w15 @ R10 + ADCQ CX, R10 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a8 * b4 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | + // | W q1 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a9 * b4 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) - // | a10 * b4 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | save the carry from q1 + // | should be added to w16 + MOVQ CX, 136(SP) - // | a11 * b4 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | - // | a12 * b4 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) +/* montgomerry reduction q2 */ - // | a13 * b4 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | - // | a14 * b4 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) +/* i = 0 */ - // | a15 * b4 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 256(SP) | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) 
| 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | - // | b5 - MOVQ 40(SI), CX - // | a0 * b5 - // | (w5, w6, w7, w8) @ (R13, R14, R15, (SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, R14 - ADCQ $0x00, R15 - ADCQ $0x00, (SP) + MOVQ $0x00, R8 - // | a1 * b5 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) + // | - // | a2 * b5 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) +/* */ + + // | j8 + + // | w8 @ R9 + MOVQ 64(R15), AX + MULQ 72(SP) + ADDQ AX, R9 + ADCQ DX, R8 + + // | j9 + + // | w9 @ DI + MOVQ 72(R15), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a3 * b5 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | j10 - // | a4 * b5 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w10 @ SI + MOVQ 80(R15), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w11 @ BX + MOVQ 88(R15), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w12 @ R13 + MOVQ 96(R15), AX + MULQ 72(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a5 * b5 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j13 - // | a6 * b5 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w13 @ R12 + MOVQ 104(R15), AX + MULQ 72(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a7 * b5 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j14 - // | a8 * b5 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w14 @ R11 + MOVQ 112(R15), AX + MULQ 72(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a9 * b5 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j15 - // | a10 * b5 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w15 @ R10 + MOVQ 120(R15), AX + MULQ 72(SP) + ADDQ AX, R10 - // | a11 * b5 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | bring the carry from q1 + ADCQ 136(SP), DX + ADDQ R8, R10 - // | a12 * b5 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 
96(SP) + // | move to an idle register + MOVQ 256(SP), R14 - // | a13 * b5 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w16 @ R14 + ADCQ DX, R14 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a14 * b5 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | - // | a15 * b5 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) +/* i = 1 */ - // | - // | b6 - MOVQ 48(SI), CX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 DI | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 R14 | 17 264(SP) | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a0 * b6 - // | (w6, w7, w8, w9) @ (R14, R15, (SP), 8(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R14 - ADCQ DX, R15 - ADCQ $0x00, (SP) - ADCQ $0x00, 8(SP) - // | a1 * b6 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + MOVQ $0x00, R8 - // | a2 * b6 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | - // | a3 * b6 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) +/* */ - // | a4 * b6 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j8 - // | a5 * b6 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w9 @ DI + MOVQ 64(R15), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ DX, R8 + MOVQ DI, 72(SP) - // | a6 * b6 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j9 - // | a7 * b6 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w10 @ SI + MOVQ 72(R15), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w11 @ BX + MOVQ 80(R15), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w12 @ R13 + MOVQ 88(R15), AX + MULQ 80(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a8 * b6 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j12 - // | a9 * b6 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w13 @ R12 + MOVQ 96(R15), AX + MULQ 80(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + 
ADCQ DX, R8 - // | a10 * b6 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j13 - // | a11 * b6 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w14 @ R11 + MOVQ 104(R15), AX + MULQ 80(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a12 * b6 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j14 - // | a13 * b6 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w15 @ R10 + MOVQ 112(R15), AX + MULQ 80(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a14 * b6 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | j15 - // | a15 * b6 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | w16 @ R14 + MOVQ 120(R15), AX + MULQ 80(SP) + ADDQ AX, R14 + ADCQ DX, CX + ADDQ R8, R14 - // | - // | b7 - MOVQ 56(SI), CX + // | move to an idle register + MOVQ 264(SP), DI - // | a0 * b7 - // | (w7, w8, w9, w10) @ (R15, (SP), 8(SP), 16(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, R15 - ADCQ DX, (SP) - ADCQ $0x00, 8(SP) - ADCQ $0x00, 16(SP) + // | w17 @ DI + ADCQ CX, DI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a1 * b7 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | - // | a2 * b7 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) +/* i = 2 */ - // | a3 * b7 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 SI | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 R14 | 17 DI | 18 272(SP) | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a4 * b7 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) - // | a5 * b7 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + MOVQ $0x00, R8 - // | a6 * b7 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | - // | a7 * b7 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) +/* */ - // | a8 * b7 - // | (w15, w16, w17, w18) @ 
(56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j8 - // | a9 * b7 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w10 @ SI + MOVQ 64(R15), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 80(SP) - // | a10 * b7 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j9 - // | a11 * b7 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w11 @ BX + MOVQ 72(R15), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a12 * b7 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | j10 - // | a13 * b7 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w12 @ R13 + MOVQ 80(R15), AX + MULQ 88(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a14 * b7 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j11 - // | a15 * b7 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w13 @ R12 + MOVQ 88(R15), AX + MULQ 88(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | - // | b8 - MOVQ 64(SI), CX + // | j12 - // | a0 * b8 - // | (w8, w9, w10, w11) @ ((SP), 8(SP), 16(SP), 24(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, 8(SP) - ADCQ $0x00, 16(SP) - ADCQ $0x00, 24(SP) + // | w14 @ R11 + MOVQ 96(R15), AX + MULQ 88(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a1 * b8 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | j13 - // | a2 * b8 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | w15 @ R10 + MOVQ 104(R15), AX + MULQ 88(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a3 * b8 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | j14 - // | a4 * b8 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w16 @ R14 + MOVQ 112(R15), AX + MULQ 88(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a5 * b8 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j15 - // | a6 * b8 - 
// | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w17 @ DI + MOVQ 120(R15), AX + MULQ 88(SP) + ADDQ AX, DI + ADCQ DX, CX + ADDQ R8, DI - // | a7 * b8 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | move to an idle register + MOVQ 272(SP), SI - // | a8 * b8 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w18 @ SI + ADCQ CX, SI + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a9 * b8 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | - // | a10 * b8 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) +/* i = 3 */ - // | a11 * b8 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 BX | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 280(SP) | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a12 * b8 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) - // | a13 * b8 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + MOVQ $0x00, R8 - // | a14 * b8 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | - // | a15 * b8 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) +/* */ - // | - // | b9 - MOVQ 72(SI), CX + // | j8 - // | a0 * b9 - // | (w9, w10, w11, w12) @ (8(SP), 16(SP), 24(SP), 32(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, 16(SP) - ADCQ $0x00, 24(SP) - ADCQ $0x00, 32(SP) + // | w11 @ BX + MOVQ 64(R15), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ DX, R8 + MOVQ BX, 88(SP) - // | a1 * b9 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + // | j9 - // | a2 * b9 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w12 @ R13 + MOVQ 72(R15), AX + MULQ 96(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a3 * b9 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j10 - // | a4 * b9 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - 
MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w13 @ R12 + MOVQ 80(R15), AX + MULQ 96(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a5 * b9 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j11 - // | a6 * b9 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w14 @ R11 + MOVQ 88(R15), AX + MULQ 96(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a7 * b9 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j12 - // | a8 * b9 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w15 @ R10 + MOVQ 96(R15), AX + MULQ 96(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a9 * b9 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j13 - // | a10 * b9 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w16 @ R14 + MOVQ 104(R15), AX + MULQ 96(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w17 @ DI + MOVQ 112(R15), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j15 + + // | w18 @ SI + MOVQ 120(R15), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ DX, CX + ADDQ R8, SI - // | a11 * b9 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | move to an idle register + MOVQ 280(SP), BX - // | a12 * b9 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | w19 @ BX + ADCQ CX, BX + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a13 * b9 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | - // | a14 * b9 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) +/* i = 4 */ - // | a15 * b9 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 R13 | 13 R12 | 14 R11 | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 BX | 20 288(SP) | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | - // | b10 - MOVQ 80(SI), CX - // | a0 * b10 - // | (w10, w11, w12, w13) @ (16(SP), 24(SP), 32(SP), 40(SP)) 
- MOVQ (DI), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, 24(SP) - ADCQ $0x00, 32(SP) - ADCQ $0x00, 40(SP) + MOVQ $0x00, R8 - // | a1 * b10 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | - // | a2 * b10 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) +/* */ - // | a3 * b10 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j8 - // | a4 * b10 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w12 @ R13 + MOVQ 64(R15), AX + MULQ 104(SP) + ADDQ AX, R13 + ADCQ DX, R8 + MOVQ R13, 96(SP) - // | a5 * b10 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j9 - // | a6 * b10 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w13 @ R12 + MOVQ 72(R15), AX + MULQ 104(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a7 * b10 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j10 - // | a8 * b10 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) - - // | a9 * b10 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) - - // | a10 * b10 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) - - // | a11 * b10 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) - - // | a12 * b10 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) - - // | a13 * b10 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) - - // | a14 * b10 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) - - // | a15 * b10 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | w14 @ R11 + MOVQ 80(R15), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | - // | b11 - MOVQ 88(SI), CX + // | j11 - // | a0 * b11 - // | (w11, w12, w13, w14) @ (24(SP), 32(SP), 40(SP), 48(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, 
32(SP) - ADCQ $0x00, 40(SP) - ADCQ $0x00, 48(SP) + // | w15 @ R10 + MOVQ 88(R15), AX + MULQ 104(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a1 * b11 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | j12 - // | a2 * b11 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | w16 @ R14 + MOVQ 96(R15), AX + MULQ 104(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w17 @ DI + MOVQ 104(R15), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w18 @ SI + MOVQ 112(R15), AX + MULQ 104(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a3 * b11 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | j15 - // | a4 * b11 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w19 @ BX + MOVQ 120(R15), AX + MULQ 104(SP) + ADDQ AX, BX + ADCQ DX, CX + ADDQ R8, BX - // | a5 * b11 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | move to an idle register + MOVQ 288(SP), R13 - // | a6 * b11 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w20 @ R13 + ADCQ CX, R13 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a7 * b11 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | a8 * b11 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +/* i = 5 */ - // | a9 * b11 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 R12 | 14 R11 | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 BX | 20 R13 | 21 296(SP) | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a10 * b11 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) - // | a11 * b11 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + MOVQ $0x00, R8 - // | a12 * b11 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | - // | a13 * b11 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 104(DI), 
AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) +/* */ - // | a14 * b11 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | j8 - // | a15 * b11 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) + // | w13 @ R12 + MOVQ 64(R15), AX + MULQ 112(SP) + ADDQ AX, R12 + ADCQ DX, R8 + MOVQ R12, 104(SP) - // | - // | b12 - MOVQ 96(SI), CX + // | j9 - // | a0 * b12 - // | (w12, w13, w14, w15) @ (32(SP), 40(SP), 48(SP), 56(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, 40(SP) - ADCQ $0x00, 48(SP) - ADCQ $0x00, 56(SP) + // | w14 @ R11 + MOVQ 72(R15), AX + MULQ 112(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a1 * b12 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j10 - // | a2 * b12 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w15 @ R10 + MOVQ 80(R15), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 - // | a3 * b12 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | w16 @ R14 + MOVQ 88(R15), AX + MULQ 112(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w17 @ DI + MOVQ 96(R15), AX + MULQ 112(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w18 @ SI + MOVQ 104(R15), AX + MULQ 112(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a4 * b12 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | j14 - // | a5 * b12 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | w19 @ BX + MOVQ 112(R15), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a6 * b12 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j15 - // | a7 * b12 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w20 @ R13 + MOVQ 120(R15), AX + MULQ 112(SP) + ADDQ AX, R13 + ADCQ DX, CX + ADDQ R8, R13 - // | a8 * b12 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | move to an idle register + MOVQ 296(SP), R12 - // | a9 * b12 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | w21 @ R12 + ADCQ CX, R12 + MOVQ $0x00, CX 
+ ADCQ $0x00, CX - // | a10 * b12 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | - // | a11 * b12 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) +/* i = 6 */ - // | a12 * b12 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 R11 | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 BX | 20 R13 | 21 R12 | 22 304(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a13 * b12 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) - // | a14 * b12 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) + MOVQ $0x00, R8 - // | a15 * b12 - // | (w27, w28, w29, w30) @ (152(SP), 160(SP), 168(SP), 176(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) - ADCQ $0x00, 176(SP) + // | - // | - // | b13 - MOVQ 104(SI), CX +/* */ - // | a0 * b13 - // | (w13, w14, w15, w16) @ (40(SP), 48(SP), 56(SP), 64(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, 48(SP) - ADCQ $0x00, 56(SP) - ADCQ $0x00, 64(SP) + // | j8 - // | a1 * b13 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) + // | w14 @ R11 + MOVQ 64(R15), AX + MULQ 120(SP) + ADDQ AX, R11 + ADCQ DX, R8 + MOVQ R11, 112(SP) - // | a2 * b13 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | j9 - // | a3 * b13 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | w15 @ R10 + MOVQ 72(R15), AX + MULQ 120(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a4 * b13 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | j10 - // | a5 * b13 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | w16 @ R14 + MOVQ 80(R15), AX + MULQ 120(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w17 @ DI + MOVQ 88(R15), AX + MULQ 120(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w18 @ SI + MOVQ 96(R15), AX + MULQ 120(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a6 * b13 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ 
$0x00, 112(SP) + // | j13 - // | a7 * b13 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | w19 @ BX + MOVQ 104(R15), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a8 * b13 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | j14 - // | a9 * b13 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | w20 @ R13 + MOVQ 112(R15), AX + MULQ 120(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a10 * b13 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | j15 - // | a11 * b13 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | w21 @ R12 + MOVQ 120(R15), AX + MULQ 120(SP) + ADDQ AX, R12 + ADCQ DX, CX + ADDQ R8, R12 - // | a12 * b13 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | move to an idle register + MOVQ 304(SP), R11 - // | a13 * b13 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) + // | w22 @ R11 + ADCQ CX, R11 + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a14 * b13 - // | (w27, w28, w29, w30) @ (152(SP), 160(SP), 168(SP), 176(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) - ADCQ $0x00, 176(SP) + // | - // | a15 * b13 - // | (w28, w29, w30, w31) @ (160(SP), 168(SP), 176(SP), 184(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, 168(SP) - ADCQ $0x00, 176(SP) - ADCQ $0x00, 184(SP) +/* i = 7 */ - // | - // | b14 - MOVQ 112(SI), CX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 BX | 20 R13 | 21 R12 | 22 R11 | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a0 * b14 - // | (w14, w15, w16, w17) @ (48(SP), 56(SP), 64(SP), 72(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, 56(SP) - ADCQ $0x00, 64(SP) - ADCQ $0x00, 72(SP) - // | a1 * b14 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + MOVQ $0x00, R8 - // | a2 * b14 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) + // | - // | a3 * b14 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) +/* */ - // | a4 * b14 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 
88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | j8 - // | a5 * b14 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) + // | w15 @ R10 + MOVQ 64(R15), AX + MULQ 128(SP) + ADDQ AX, R10 + ADCQ DX, R8 - // | a6 * b14 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + // | j9 - // | a7 * b14 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | w16 @ R14 + MOVQ 72(R15), AX + MULQ 128(SP) + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w17 @ DI + MOVQ 80(R15), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w18 @ SI + MOVQ 88(R15), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a8 * b14 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) + // | j12 - // | a9 * b14 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | w19 @ BX + MOVQ 96(R15), AX + MULQ 128(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a10 * b14 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) + // | j13 - // | a11 * b14 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | w20 @ R13 + MOVQ 104(R15), AX + MULQ 128(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a12 * b14 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) + // | j14 - // | a13 * b14 - // | (w27, w28, w29, w30) @ (152(SP), 160(SP), 168(SP), 176(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) - ADCQ $0x00, 176(SP) + // | w21 @ R12 + MOVQ 112(R15), AX + MULQ 128(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | a14 * b14 - // | (w28, w29, w30, w31) @ (160(SP), 168(SP), 176(SP), 184(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, 168(SP) - ADCQ $0x00, 176(SP) - ADCQ $0x00, 184(SP) + // | j15 - // | a15 * b14 - // | (w29, w30, w31) @ (168(SP), 176(SP), 184(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, 176(SP) - ADCQ $0x00, 184(SP) + // | w22 @ R11 + MOVQ 120(R15), AX + MULQ 128(SP) + ADDQ AX, R11 + ADCQ DX, CX + ADDQ R8, R11 - // | - // | b15 - MOVQ 120(SI), CX + // | tolarete this limb to stay in stack + // | w23 @ 64(SP) + ADCQ CX, 64(SP) + MOVQ $0x00, CX + ADCQ $0x00, CX - // | a0 * b15 - // | (w15, w16, w17, w18) @ (56(SP), 64(SP), 72(SP), 80(SP)) - MOVQ (DI), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, 64(SP) - ADCQ $0x00, 72(SP) - ADCQ $0x00, 80(SP) + // | + // | q2 + // | 0 - | 1 - | 2 - | 
3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 72(SP) | 10 80(SP) | 11 88(SP) | 12 96(SP) | 13 104(SP) | 14 112(SP) | 15 R10 + // | 16 R14 | 17 DI | 18 SI | 19 BX | 20 R13 | 21 R12 | 22 R11 | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a1 * b15 - // | (w16, w17, w18, w19) @ (64(SP), 72(SP), 80(SP), 88(SP)) - MOVQ 8(DI), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 72(SP) - ADCQ $0x00, 80(SP) - ADCQ $0x00, 88(SP) - // | a2 * b15 - // | (w17, w18, w19, w20) @ (72(SP), 80(SP), 88(SP), 96(SP)) - MOVQ 16(DI), AX - MULQ CX - ADDQ AX, 72(SP) - ADCQ DX, 80(SP) - ADCQ $0x00, 88(SP) - ADCQ $0x00, 96(SP) + // | save the carry from q2 + // | should be added to w24 + MOVQ CX, 136(SP) - // | a3 * b15 - // | (w18, w19, w20, w21) @ (80(SP), 88(SP), 96(SP), 104(SP)) - MOVQ 24(DI), AX - MULQ CX - ADDQ AX, 80(SP) - ADCQ DX, 88(SP) - ADCQ $0x00, 96(SP) - ADCQ $0x00, 104(SP) + // | - // | a4 * b15 - // | (w19, w20, w21, w22) @ (88(SP), 96(SP), 104(SP), 112(SP)) - MOVQ 32(DI), AX - MULQ CX - ADDQ AX, 88(SP) - ADCQ DX, 96(SP) - ADCQ $0x00, 104(SP) - ADCQ $0x00, 112(SP) +/* q2 q3 transition swap */ - // | a5 * b15 - // | (w20, w21, w22, w23) @ (96(SP), 104(SP), 112(SP), 120(SP)) - MOVQ 40(DI), AX - MULQ CX - ADDQ AX, 96(SP) - ADCQ DX, 104(SP) - ADCQ $0x00, 112(SP) - ADCQ $0x00, 120(SP) + MOVQ 72(SP), CX + MOVQ R11, 72(SP) + MOVQ 80(SP), R11 + MOVQ R12, 80(SP) + MOVQ 88(SP), R12 + MOVQ R13, 88(SP) + MOVQ 96(SP), R13 + MOVQ BX, 96(SP) + MOVQ 104(SP), BX + MOVQ SI, 104(SP) + MOVQ 112(SP), SI + MOVQ DI, 112(SP) - // | a6 * b15 - // | (w21, w22, w23, w24) @ (104(SP), 112(SP), 120(SP), 128(SP)) - MOVQ 48(DI), AX - MULQ CX - ADDQ AX, 104(SP) - ADCQ DX, 112(SP) - ADCQ $0x00, 120(SP) - ADCQ $0x00, 128(SP) + // | + // | W q2 q3 transition + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 CX | 10 R11 | 11 R12 | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 112(SP) | 18 104(SP) | 19 96(SP) | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a7 * b15 - // | (w22, w23, w24, w25) @ (112(SP), 120(SP), 128(SP), 136(SP)) - MOVQ 56(DI), AX - MULQ CX - ADDQ AX, 112(SP) - ADCQ DX, 120(SP) - ADCQ $0x00, 128(SP) - ADCQ $0x00, 136(SP) - // | a8 * b15 - // | (w23, w24, w25, w26) @ (120(SP), 128(SP), 136(SP), 144(SP)) - MOVQ 64(DI), AX - MULQ CX - ADDQ AX, 120(SP) - ADCQ DX, 128(SP) - ADCQ $0x00, 136(SP) - ADCQ $0x00, 144(SP) + // | - // | a9 * b15 - // | (w24, w25, w26, w27) @ (128(SP), 136(SP), 144(SP), 152(SP)) - MOVQ 72(DI), AX - MULQ CX - ADDQ AX, 128(SP) - ADCQ DX, 136(SP) - ADCQ $0x00, 144(SP) - ADCQ $0x00, 152(SP) +/* montgomery reduction q3 */ - // | a10 * b15 - // | (w25, w26, w27, w28) @ (136(SP), 144(SP), 152(SP), 160(SP)) - MOVQ 80(DI), AX - MULQ CX - ADDQ AX, 136(SP) - ADCQ DX, 144(SP) - ADCQ $0x00, 152(SP) - ADCQ $0x00, 160(SP) + // | - // | a11 * b15 - // | (w26, w27, w28, w29) @ (144(SP), 152(SP), 160(SP), 168(SP)) - MOVQ 88(DI), AX - MULQ CX - ADDQ AX, 144(SP) - ADCQ DX, 152(SP) - ADCQ $0x00, 160(SP) - ADCQ $0x00, 168(SP) +/* i = 8 */ - // | a12 * b15 - // | (w27, w28, w29, w30) @ (152(SP), 160(SP), 168(SP), 176(SP)) - MOVQ 96(DI), AX - MULQ CX - ADDQ AX, 152(SP) - ADCQ DX, 160(SP) - ADCQ $0x00, 168(SP) - ADCQ $0x00, 176(SP) + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 R9 | 9 CX | 10 R11 | 11 R12 | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 112(SP) | 18 104(SP) | 19 96(SP) | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 
64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | a13 * b15 - // | (w28, w29, w30, w31) @ (160(SP), 168(SP), 176(SP), 184(SP)) - MOVQ 104(DI), AX - MULQ CX - ADDQ AX, 160(SP) - ADCQ DX, 168(SP) - ADCQ $0x00, 176(SP) - ADCQ $0x00, 184(SP) - // | a14 * b15 - // | (w29, w30, w31) @ (168(SP), 176(SP), 184(SP)) - MOVQ 112(DI), AX - MULQ CX - ADDQ AX, 168(SP) - ADCQ DX, 176(SP) - ADCQ $0x00, 184(SP) + // | | u8 = w8 * inp + MOVQ R9, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | a15 * b15 - // | (w30, w31) @ (176(SP), 184(SP)) - MOVQ 120(DI), AX - MULQ CX - ADDQ AX, 176(SP) - ADCQ DX, 184(SP) + // | - // | - // | Montgomerry Reduction - MOVQ R15, 192(SP) - MOVQ R14, 200(SP) - MOVQ p+24(FP), R14 +/* */ - // | - // | (u @ CX) = (w0 @ R8) * inp - MOVQ R8, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | save u8 + MOVQ DI, 120(SP) - // | w0 @ R8 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI + // | j0 - // | w1 @ R9 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX + // | w8 @ R9 + MOVQ (R15), AX + MULQ DI ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + ADCQ DX, R8 - // | w2 @ R10 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j1 - // | w3 @ R11 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX + // | w9 @ CX + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w10 @ R11 + MOVQ 16(R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w4 @ R12 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX + // | j3 + + // | w11 @ R12 + MOVQ 24(R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w5 @ R13 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | j4 - // | w6 @ 200(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | w12 @ R13 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w13 @ BX + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w14 @ SI + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w7 @ 192(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | j7 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w15 @ R10 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w16 @ R14 + ADCQ DX, R14 + ADCQ $0x00, R9 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI +/* i = 9 */ - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 
- | 8 - | 9 CX | 10 R11 | 11 R12 | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 112(SP) | 18 104(SP) | 19 96(SP) | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | | u9 = w9 * inp + MOVQ CX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + // | - // | w16 @ 64(SP) - ADDQ SI, 64(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* */ - // | - MOVQ 72(SP), R8 + // | save u9 + MOVQ DI, 128(SP) - // | (u @ CX) = (w1 @ R9) * inp - MOVQ R9, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j0 - // | w1 @ R9 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI + // | w9 @ CX + MOVQ (R15), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R8 - // | w2 @ R10 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j1 - // | w3 @ R11 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX + // | w10 @ R11 + MOVQ 8(R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w4 @ R12 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX + // | j2 + + // | w11 @ R12 + MOVQ 16(R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w5 @ R13 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX + // | j3 + + // | w12 @ R13 + MOVQ 24(R15), AX + MULQ DI ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w13 @ BX + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w14 @ SI + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w6 @ 200(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | j6 - // | w7 @ 192(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | w15 @ R10 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | j7 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w16 @ R14 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ DX, R9 + ADDQ R8, R14 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | move to idle register + MOVQ 112(SP), CX - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | w17 @ CX + ADCQ R9, CX + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 96(R14), 
AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI +/* i = 10 */ - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 R11 | 11 R12 | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 CX | 18 104(SP) | 19 96(SP) | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI + // | | u10 = w10 * inp + MOVQ R11, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | w17 @ R8 - ADDQ SI, R15 - ADCQ R15, R8 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 80(SP), R9 +/* */ - // | (u @ CX) = (w2 @ R10) * inp - MOVQ R10, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | save u10 + MOVQ DI, 112(SP) - // | w2 @ R10 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI + // | j0 - // | w3 @ R11 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX + // | w10 @ R11 + MOVQ (R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ DX, R8 - // | w4 @ R12 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX + // | j1 + + // | w11 @ R12 + MOVQ 8(R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w5 @ R13 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX + // | j2 + + // | w12 @ R13 + MOVQ 16(R15), AX + MULQ DI ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w13 @ BX + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w14 @ SI + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w6 @ 200(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | j5 - // | w7 @ 192(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | w15 @ R10 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | j6 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w16 @ R14 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | j7 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w17 @ CX + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ DX, R9 + ADDQ R8, CX - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | move to idle register + MOVQ 104(SP), R11 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ 
AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w18 @ R11 + ADCQ R9, R11 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI +/* i = 11 */ - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 R12 | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 CX | 18 R11 | 19 96(SP) | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w17 @ R8 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI - // | w18 @ R9 - ADDQ SI, R15 - ADCQ R15, R9 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | | u11 = w11 * inp + MOVQ R12, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | - MOVQ 88(SP), R10 + // | - // | (u @ CX) = (w3 @ R11) * inp - MOVQ R11, AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w3 @ R11 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI + // | save u11 + MOVQ DI, 104(SP) - // | w4 @ R12 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX + // | j0 + + // | w11 @ R12 + MOVQ (R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ DX, R8 - // | w5 @ R13 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j1 - // | w6 @ 200(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | w12 @ R13 + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w13 @ BX + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w14 @ SI + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w7 @ 192(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | j4 - // | w8 @ (SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | w15 @ R10 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | j5 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | w16 @ R14 + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | j6 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | w17 @ CX + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | j7 - 
// | w14 @ 48(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | w18 @ R11 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ DX, R9 + ADDQ R8, R11 - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | move to idle register + MOVQ 96(SP), R12 - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI + // | w19 @ R12 + ADCQ R9, R12 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w17 @ R8 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | - // | w18 @ R9 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI +/* i = 12 */ - // | w19 @ R10 - ADDQ SI, R15 - ADCQ R15, R10 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 R13 | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 88(SP) | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | - MOVQ 96(SP), R11 - // | (u @ CX) = (w4 @ R12) * inp - MOVQ R12, AX + // | | u12 = w12 * inp + MOVQ R13, AX MULQ inp+32(FP) - MOVQ AX, CX + MOVQ AX, DI + MOVQ $0x00, R8 - // | w4 @ R12 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI + // | - // | w5 @ R13 - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI +/* */ - // | w6 @ 200(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | save u12 + MOVQ DI, 96(SP) - // | w7 @ 192(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | j0 - // | w8 @ (SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w12 @ R13 + MOVQ (R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ DX, R8 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | j1 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | w13 @ BX + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | j2 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | w14 @ SI + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | j3 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | w15 @ R10 + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + // | j4 - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 64(SP) - 
ADCQ DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + // | w16 @ R14 + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w17 @ R8 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | j5 - // | w18 @ R9 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | w17 @ CX + MOVQ 40(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w19 @ R10 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j6 - // | w20 @ R11 - ADDQ SI, R15 - ADCQ R15, R11 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w18 @ R11 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | - MOVQ 104(SP), R12 + // | j7 - // | (u @ CX) = (w5 @ R13) * inp - MOVQ R13, AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w19 @ R12 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ DX, R9 + ADDQ R8, R12 - // | w5 @ R13 - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI + // | move to idle register + MOVQ 88(SP), R13 - // | w6 @ 200(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | w20 @ R13 + ADCQ R9, R13 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w7 @ 192(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | - // | w8 @ (SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI +/* i = 13 */ - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 BX | 14 SI | 15 R10 + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 80(SP) | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + // | | u13 = w13 * inp + MOVQ BX, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI +/* */ - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | save u13 + MOVQ DI, 88(SP) - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | j0 - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI + // | w13 @ BX + MOVQ (R15), AX + MULQ DI + ADDQ AX, BX + ADCQ DX, R8 - // | w17 @ R8 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j1 - // | w18 @ R9 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w14 @ SI + MOVQ 8(R15), AX + 
MULQ DI + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w19 @ R10 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX + // | j2 + + // | w15 @ R10 + MOVQ 16(R15), AX + MULQ DI ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX + // | j3 + + // | w16 @ R14 + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j4 + + // | w17 @ CX + MOVQ 32(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j5 + + // | w18 @ R11 + MOVQ 40(R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - ADDQ SI, R15 - ADCQ R15, R12 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j6 - // | - MOVQ 112(SP), R13 + // | w19 @ R12 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | (u @ CX) = (w6 @ 200(SP)) * inp - MOVQ 200(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j7 - // | w6 @ 200(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI + // | w20 @ R13 + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ DX, R9 + ADDQ R8, R13 - // | w7 @ 192(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | move to idle register + MOVQ 80(SP), BX - // | w8 @ (SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w21 @ BX + ADCQ R9, BX + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI +/* i = 14 */ + + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 SI | 15 R10 + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 72(SP) | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) + + + // | | u14 = w14 * inp + MOVQ SI, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 + + // | + +/* */ - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | save u14 + MOVQ DI, 80(SP) - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | j0 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w14 @ SI + MOVQ (R15), AX + MULQ DI + ADDQ AX, SI + ADCQ DX, R8 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | j1 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI + // | w15 @ R10 + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + // | j2 - // | w17 @ R8 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ 
DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w16 @ R14 + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w18 @ R9 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j3 - // | w19 @ R10 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | w17 @ CX + MOVQ 24(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX + // | j4 + + // | w18 @ R11 + MOVQ 32(R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX + // | j5 + + // | w19 @ R12 + MOVQ 40(R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - ADDQ SI, R15 - ADCQ R15, R13 - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j6 - // | - MOVQ 120(SP), BX - MOVQ BX, 200(SP) + // | w20 @ R13 + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | (u @ CX) = (w7 @ 192(SP)) * inp - MOVQ 192(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j7 - // | w7 @ 192(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI + // | w21 @ BX + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ DX, R9 + ADDQ R8, BX - // | w8 @ (SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | move to idle register + MOVQ 72(SP), SI - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | w22 @ SI + ADCQ R9, SI + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI +/* i = 15 */ - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 R10 + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 64(SP) | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | | u15 = w15 * inp + MOVQ R10, AX + MULQ inp+32(FP) + MOVQ AX, DI + MOVQ $0x00, R8 - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI +/* */ - // | w17 @ R8 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | save u15 + MOVQ DI, 72(SP) - // | w18 @ R9 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j0 - // | w19 @ 
R10 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX + // | w15 @ R10 + MOVQ (R15), AX + MULQ DI ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ DX, R8 - // | w20 @ R11 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX + // | j1 + + // | w16 @ R14 + MOVQ 8(R15), AX + MULQ DI + ADDQ AX, R14 + ADCQ $0x00, DX + ADDQ R8, R14 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j2 + + // | w17 @ CX + MOVQ 16(R15), AX + MULQ DI + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j3 + + // | w18 @ R11 + MOVQ 24(R15), AX + MULQ DI ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX + // | j4 + + // | w19 @ R12 + MOVQ 32(R15), AX + MULQ DI ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX + // | j5 + + // | w20 @ R13 + MOVQ 40(R15), AX + MULQ DI ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j6 + + // | w21 @ BX + MOVQ 48(R15), AX + MULQ DI + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j7 + + // | w22 @ SI + MOVQ 56(R15), AX + MULQ DI + ADDQ AX, SI + ADCQ DX, R9 + ADDQ R8, SI - // | w23 @ 200(SP) - ADDQ SI, R15 - ADCQ R15, 200(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | move to idle register + MOVQ 64(SP), R10 - // | - MOVQ 128(SP), BX - MOVQ BX, 192(SP) + // | w23 @ R10 + ADCQ R9, R10 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | (u @ CX) = (w8 @ (SP)) * inp - MOVQ (SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | + // | W q3 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w8 @ (SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - // | w9 @ 8(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | aggregate carries from q2 & q3 + // | should be added to w24 + ADCQ R9, 136(SP) - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI +/* montgomerry reduction q4 */ - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI +/* i = 0 */ - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 56(SP) | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ 
DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + MOVQ $0x00, R8 - // | w17 @ R8 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | - // | w18 @ R9 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI +/* */ - // | w19 @ R10 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j8 - // | w20 @ R11 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX + // | w16 @ R14 + MOVQ 64(R15), AX + MULQ 120(SP) + ADDQ AX, R14 + ADCQ DX, R8 + + // | j9 + + // | w17 @ CX + MOVQ 72(R15), AX + MULQ 120(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w18 @ R11 + MOVQ 80(R15), AX + MULQ 120(SP) ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | j11 - // | w22 @ R13 - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | w19 @ R12 + MOVQ 88(R15), AX + MULQ 120(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w23 @ 200(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | j12 - // | w24 @ 192(SP) - ADDQ SI, R15 - ADCQ R15, 192(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | w20 @ R13 + MOVQ 96(R15), AX + MULQ 120(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w21 @ BX + MOVQ 104(R15), AX + MULQ 120(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w22 @ SI + MOVQ 112(R15), AX + MULQ 120(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | - MOVQ 136(SP), BX - MOVQ BX, (SP) + // | j15 - // | (u @ CX) = (w9 @ 8(SP)) * inp - MOVQ 8(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w23 @ R10 + MOVQ 120(R15), AX + MULQ 120(SP) + ADDQ AX, R10 + ADCQ 136(SP), DX + ADDQ R8, R10 + MOVQ 56(SP), DI - // | w9 @ 8(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 8(SP) + // | w24 @ DI ADCQ DX, DI + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w10 @ 16(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI +/* i = 1 */ - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 CX | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 DI | 25 48(SP) | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + MOVQ $0x00, R8 - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, 
SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI +/* */ - // | w17 @ R8 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j8 - // | w18 @ R9 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w17 @ CX + MOVQ 64(R15), AX + MULQ 128(SP) + ADDQ AX, CX + ADCQ DX, R8 + MOVQ CX, 64(SP) - // | w19 @ R10 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j9 - // | w20 @ R11 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX + // | w18 @ R11 + MOVQ 72(R15), AX + MULQ 128(SP) ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI - - // | w21 @ R12 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI - - // | w22 @ R13 - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w23 @ 200(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | j10 - // | w24 @ 192(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | w19 @ R12 + MOVQ 80(R15), AX + MULQ 128(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w25 @ (SP) - ADDQ SI, R15 - ADCQ R15, (SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | j11 - // | - MOVQ 144(SP), BX - MOVQ BX, 8(SP) + // | w20 @ R13 + MOVQ 88(R15), AX + MULQ 128(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w21 @ BX + MOVQ 96(R15), AX + MULQ 128(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w22 @ SI + MOVQ 104(R15), AX + MULQ 128(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | (u @ CX) = (w10 @ 16(SP)) * inp - MOVQ 16(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | j14 - // | w10 @ 16(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI + // | w23 @ R10 + MOVQ 112(R15), AX + MULQ 128(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j15 + + // | w24 @ DI + MOVQ 120(R15), AX + MULQ 128(SP) + ADDQ AX, DI + ADCQ DX, R9 + ADDQ R8, DI + MOVQ 48(SP), CX - // | w11 @ 24(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | w25 @ CX + ADCQ R9, CX + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI + // | - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI +/* i = 2 */ - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 R11 | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 DI | 25 CX | 26 40(SP) | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, DI - ADDQ SI, 64(SP) - 
ADCQ $0x00, DI + MOVQ $0x00, R8 - // | w17 @ R8 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | - // | w18 @ R9 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI +/* */ - // | w19 @ R10 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + // | j8 - // | w20 @ R11 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX + // | w18 @ R11 + MOVQ 64(R15), AX + MULQ 112(SP) ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ DX, R8 + MOVQ R11, 48(SP) - // | w21 @ R12 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX + // | j9 + + // | w19 @ R12 + MOVQ 72(R15), AX + MULQ 112(SP) ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX + // | j10 + + // | w20 @ R13 + MOVQ 80(R15), AX + MULQ 112(SP) ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w21 @ BX + MOVQ 88(R15), AX + MULQ 112(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w22 @ SI + MOVQ 96(R15), AX + MULQ 112(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w23 @ 200(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | j13 - // | w24 @ 192(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | w23 @ R10 + MOVQ 104(R15), AX + MULQ 112(SP) + ADDQ AX, R10 + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w24 @ DI + MOVQ 112(R15), AX + MULQ 112(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j15 + + // | w25 @ CX + MOVQ 120(R15), AX + MULQ 112(SP) + ADDQ AX, CX + ADCQ DX, R9 + ADDQ R8, CX + MOVQ 40(SP), R11 - // | w25 @ (SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | w26 @ R11 + ADCQ R9, R11 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w26 @ 8(SP) - ADDQ SI, R15 - ADCQ R15, 8(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | - MOVQ 152(SP), BX - MOVQ BX, 16(SP) +/* i = 3 */ - // | (u @ CX) = (w11 @ 24(SP)) * inp - MOVQ 24(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 R12 | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 32(SP) | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w11 @ 24(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - // | w12 @ 32(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + MOVQ $0x00, R8 - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI - ADDQ SI, 40(SP) - ADCQ $0x00, DI + // | - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI +/* */ - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | j8 - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ 
DI, 64(SP) - ADCQ $0x00, SI + // | w19 @ R12 + MOVQ 64(R15), AX + MULQ 104(SP) + ADDQ AX, R12 + ADCQ DX, R8 + MOVQ R12, 40(SP) - // | w17 @ R8 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j9 - // | w18 @ R9 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w20 @ R13 + MOVQ 72(R15), AX + MULQ 104(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w21 @ BX + MOVQ 80(R15), AX + MULQ 104(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w22 @ SI + MOVQ 88(R15), AX + MULQ 104(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w19 @ R10 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX + // | j12 + + // | w23 @ R10 + MOVQ 96(R15), AX + MULQ 104(SP) ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w24 @ DI + MOVQ 104(R15), AX + MULQ 104(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w25 @ CX + MOVQ 112(R15), AX + MULQ 104(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j15 - // | w21 @ R12 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w26 @ R11 + MOVQ 120(R15), AX + MULQ 104(SP) + ADDQ AX, R11 + ADCQ DX, R9 + ADDQ R8, R11 + MOVQ 32(SP), R12 - // | w22 @ R13 - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | w27 @ R12 + ADCQ R9, R12 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w23 @ 200(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | - // | w24 @ 192(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI +/* i = 4 */ - // | w25 @ (SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 40(SP) | 20 R13 | 21 BX | 22 SI | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 R12 | 28 24(SP) | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w26 @ 8(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI - // | w27 @ 16(SP) - ADDQ SI, R15 - ADCQ R15, 16(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + MOVQ $0x00, R8 - // | - MOVQ 160(SP), BX - MOVQ BX, 24(SP) + // | - // | (u @ CX) = (w12 @ 32(SP)) * inp - MOVQ 32(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX +/* */ - // | w12 @ 32(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI + // | j8 - // | w13 @ 40(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + // | w20 @ R13 + MOVQ 64(R15), AX + MULQ 96(SP) + ADDQ AX, R13 + ADCQ DX, R8 + MOVQ R13, 32(SP) - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI - ADDQ SI, 48(SP) - ADCQ $0x00, DI + // | j9 - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ 
$0x00, SI + // | w21 @ BX + MOVQ 72(R15), AX + MULQ 96(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + // | j10 - // | w17 @ R8 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w22 @ SI + MOVQ 80(R15), AX + MULQ 96(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w18 @ R9 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j11 - // | w19 @ R10 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX + // | w23 @ R10 + MOVQ 88(R15), AX + MULQ 96(SP) ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w24 @ DI + MOVQ 96(R15), AX + MULQ 96(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j13 + + // | w25 @ CX + MOVQ 104(R15), AX + MULQ 96(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + // | j14 - // | w21 @ R12 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + // | w26 @ R11 + MOVQ 112(R15), AX + MULQ 96(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j15 - // | w23 @ 200(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | w27 @ R12 + MOVQ 120(R15), AX + MULQ 96(SP) + ADDQ AX, R12 + ADCQ DX, R9 + ADDQ R8, R12 + MOVQ 24(SP), R13 - // | w24 @ 192(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | w28 @ R13 + ADCQ R9, R13 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w25 @ (SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI + // | - // | w26 @ 8(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI +/* i = 5 */ - // | w27 @ 16(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 BX | 22 SI | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 R12 | 28 R13 | 29 16(SP) | 30 8(SP) | 31 (SP) - // | w28 @ 24(SP) - ADDQ SI, R15 - ADCQ R15, 24(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 - // | - MOVQ 168(SP), BX - MOVQ BX, 32(SP) + MOVQ $0x00, R8 - // | (u @ CX) = (w13 @ 40(SP)) * inp - MOVQ 40(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | - // | w13 @ 40(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, DI +/* */ - // | w14 @ 48(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, SI - ADDQ DI, 48(SP) - ADCQ $0x00, SI + // | j8 - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI - ADDQ SI, 56(SP) - ADCQ $0x00, DI + // | w21 @ BX + MOVQ 64(R15), AX + MULQ 88(SP) + ADDQ AX, BX + ADCQ DX, R8 + MOVQ BX, 24(SP) - // | w16 @ 
64(SP) - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI + // | j9 - // | w17 @ R8 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | w22 @ SI + MOVQ 72(R15), AX + MULQ 88(SP) + ADDQ AX, SI + ADCQ $0x00, DX + ADDQ R8, SI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w18 @ R9 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | j10 - // | w19 @ R10 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX + // | w23 @ R10 + MOVQ 80(R15), AX + MULQ 88(SP) ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w24 @ DI + MOVQ 88(R15), AX + MULQ 88(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j12 + + // | w25 @ CX + MOVQ 96(R15), AX + MULQ 88(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + // | j13 - // | w21 @ R12 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + // | w26 @ R11 + MOVQ 104(R15), AX + MULQ 88(SP) + ADDQ AX, R11 + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | j14 - // | w23 @ 200(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | w27 @ R12 + MOVQ 112(R15), AX + MULQ 88(SP) + ADDQ AX, R12 + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w24 @ 192(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | j15 - // | w25 @ (SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w28 @ R13 + MOVQ 120(R15), AX + MULQ 88(SP) + ADDQ AX, R13 + ADCQ DX, R9 + ADDQ R8, R13 + MOVQ 16(SP), BX - // | w26 @ 8(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | w29 @ BX + ADCQ R9, BX + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w27 @ 16(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI + // | - // | w28 @ 24(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI +/* i = 6 */ - // | w29 @ 32(SP) - ADDQ SI, R15 - ADCQ R15, 32(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 SI | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 R12 | 28 R13 | 29 BX | 30 8(SP) | 31 (SP) - // | - MOVQ 176(SP), BX - MOVQ BX, 40(SP) - // | (u @ CX) = (w14 @ 48(SP)) * inp - MOVQ 48(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + MOVQ $0x00, R8 - // | w14 @ 48(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 48(SP) - ADCQ DX, DI + // | - // | w15 @ 56(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, SI - ADDQ DI, 56(SP) - ADCQ $0x00, SI +/* */ - // | w16 @ 64(SP) - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, 64(SP) - ADCQ DX, DI - ADDQ SI, 64(SP) - ADCQ $0x00, DI + // | 
j8 - // | w17 @ R8 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, SI - ADDQ DI, R8 - ADCQ $0x00, SI + // | w22 @ SI + MOVQ 64(R15), AX + MULQ 80(SP) + ADDQ AX, SI + ADCQ DX, R8 + MOVQ SI, 16(SP) - // | w18 @ R9 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, DI - ADDQ SI, R9 - ADCQ $0x00, DI + // | j9 - // | w19 @ R10 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX + // | w23 @ R10 + MOVQ 72(R15), AX + MULQ 80(SP) ADDQ AX, R10 - ADCQ DX, SI - ADDQ DI, R10 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R10 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j10 + + // | w24 @ DI + MOVQ 80(R15), AX + MULQ 80(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j11 + + // | w25 @ CX + MOVQ 88(R15), AX + MULQ 80(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w20 @ R11 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX + // | j12 + + // | w26 @ R11 + MOVQ 96(R15), AX + MULQ 80(SP) ADDQ AX, R11 - ADCQ DX, DI - ADDQ SI, R11 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX + // | j13 + + // | w27 @ R12 + MOVQ 104(R15), AX + MULQ 80(SP) ADDQ AX, R12 - ADCQ DX, SI - ADDQ DI, R12 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, DI - ADDQ SI, R13 - ADCQ $0x00, DI + // | j14 - // | w23 @ 200(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, SI - ADDQ DI, 200(SP) - ADCQ $0x00, SI + // | w28 @ R13 + MOVQ 112(R15), AX + MULQ 80(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j15 + + // | w29 @ BX + MOVQ 120(R15), AX + MULQ 80(SP) + ADDQ AX, BX + ADCQ DX, R9 + ADDQ R8, BX + MOVQ 8(SP), SI + + // | w30 @ SI + ADCQ R9, SI + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w24 @ 192(SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, DI - ADDQ SI, 192(SP) - ADCQ $0x00, DI + // | - // | w25 @ (SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, SI - ADDQ DI, (SP) - ADCQ $0x00, SI +/* i = 7 */ - // | w26 @ 8(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, DI - ADDQ SI, 8(SP) - ADCQ $0x00, DI + // | + // | W + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 16(SP) | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 R12 | 28 R13 | 29 BX | 30 SI | 31 (SP) - // | w27 @ 16(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, SI - ADDQ DI, 16(SP) - ADCQ $0x00, SI - // | w28 @ 24(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, DI - ADDQ SI, 24(SP) - ADCQ $0x00, DI + MOVQ $0x00, R8 - // | w29 @ 32(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, SI - ADDQ DI, 32(SP) - ADCQ $0x00, SI + // | - // | w30 @ 40(SP) - ADDQ SI, R15 - ADCQ R15, 40(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 +/* */ - // | - MOVQ 184(SP), BX - MOVQ BX, 48(SP) + // | j8 - // | (u @ CX) = (w15 @ 56(SP)) * inp - MOVQ 56(SP), AX - MULQ inp+32(FP) - MOVQ AX, CX + // | w23 @ R10 + MOVQ 64(R15), AX + MULQ 72(SP) + ADDQ AX, R10 + ADCQ DX, R8 - // | w15 @ 56(SP) - XORQ DI, DI - MOVQ (R14), AX - MULQ CX - ADDQ AX, 56(SP) - ADCQ DX, DI + // | j9 - // | w16 @ 64(SP) - XORQ SI, SI - MOVQ 8(R14), AX - MULQ CX - ADDQ AX, 64(SP) 
- ADCQ DX, SI - ADDQ DI, 64(SP) - ADCQ $0x00, SI + // | w24 @ DI + MOVQ 72(R15), AX + MULQ 72(SP) + ADDQ AX, DI + ADCQ $0x00, DX + ADDQ R8, DI + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w17 @ R8 - XORQ DI, DI - MOVQ 16(R14), AX - MULQ CX - ADDQ AX, R8 - ADCQ DX, DI - ADDQ SI, R8 - ADCQ $0x00, DI + // | j10 - // | w18 @ R9 - XORQ SI, SI - MOVQ 24(R14), AX - MULQ CX - ADDQ AX, R9 - ADCQ DX, SI - ADDQ DI, R9 - ADCQ $0x00, SI + // | w25 @ CX + MOVQ 80(R15), AX + MULQ 72(SP) + ADDQ AX, CX + ADCQ $0x00, DX + ADDQ R8, CX + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w19 @ R10 - XORQ DI, DI - MOVQ 32(R14), AX - MULQ CX - ADDQ AX, R10 - ADCQ DX, DI - ADDQ SI, R10 - ADCQ $0x00, DI + // | j11 - // | w20 @ R11 - XORQ SI, SI - MOVQ 40(R14), AX - MULQ CX + // | w26 @ R11 + MOVQ 88(R15), AX + MULQ 72(SP) ADDQ AX, R11 - ADCQ DX, SI - ADDQ DI, R11 - ADCQ $0x00, SI + ADCQ $0x00, DX + ADDQ R8, R11 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w21 @ R12 - XORQ DI, DI - MOVQ 48(R14), AX - MULQ CX + // | j12 + + // | w27 @ R12 + MOVQ 96(R15), AX + MULQ 72(SP) ADDQ AX, R12 - ADCQ DX, DI - ADDQ SI, R12 - ADCQ $0x00, DI + ADCQ $0x00, DX + ADDQ R8, R12 + MOVQ $0x00, R8 + ADCQ DX, R8 - // | w22 @ R13 - XORQ SI, SI - MOVQ 56(R14), AX - MULQ CX - ADDQ AX, R13 - ADCQ DX, SI - ADDQ DI, R13 - ADCQ $0x00, SI + // | j13 - // | w23 @ 200(SP) - XORQ DI, DI - MOVQ 64(R14), AX - MULQ CX - ADDQ AX, 200(SP) - ADCQ DX, DI - ADDQ SI, 200(SP) - ADCQ $0x00, DI + // | w28 @ R13 + MOVQ 104(R15), AX + MULQ 72(SP) + ADDQ AX, R13 + ADCQ $0x00, DX + ADDQ R8, R13 + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j14 + + // | w29 @ BX + MOVQ 112(R15), AX + MULQ 72(SP) + ADDQ AX, BX + ADCQ $0x00, DX + ADDQ R8, BX + MOVQ $0x00, R8 + ADCQ DX, R8 + + // | j15 + + // | w30 @ SI + MOVQ 120(R15), AX + MULQ 72(SP) + ADDQ AX, SI + ADCQ DX, R9 + ADDQ R8, SI - // | w24 @ 192(SP) - XORQ SI, SI - MOVQ 72(R14), AX - MULQ CX - ADDQ AX, 192(SP) - ADCQ DX, SI - ADDQ DI, 192(SP) - ADCQ $0x00, SI + // | very last limb goes to short carry register + MOVQ (SP), R8 - // | w25 @ (SP) - XORQ DI, DI - MOVQ 80(R14), AX - MULQ CX - ADDQ AX, (SP) - ADCQ DX, DI - ADDQ SI, (SP) - ADCQ $0x00, DI + // | w-1 @ R8 + ADCQ R9, R8 + MOVQ $0x00, R9 + ADCQ $0x00, R9 - // | w26 @ 8(SP) - XORQ SI, SI - MOVQ 88(R14), AX - MULQ CX - ADDQ AX, 8(SP) - ADCQ DX, SI - ADDQ DI, 8(SP) - ADCQ $0x00, SI + // | + // | W q4 + // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 - | 6 - | 7 - | 8 - | 9 - | 10 - | 11 - | 12 - | 13 - | 14 - | 15 - + // | 16 R14 | 17 64(SP) | 18 48(SP) | 19 40(SP) | 20 32(SP) | 21 24(SP) | 22 16(SP) | 23 R10 | 24 DI | 25 CX | 26 R11 | 27 R12 | 28 R13 | 29 BX | 30 SI | 31 R8 - // | w27 @ 16(SP) - XORQ DI, DI - MOVQ 96(R14), AX - MULQ CX - ADDQ AX, 16(SP) - ADCQ DX, DI - ADDQ SI, 16(SP) - ADCQ $0x00, DI - // | w28 @ 24(SP) - XORQ SI, SI - MOVQ 104(R14), AX - MULQ CX - ADDQ AX, 24(SP) - ADCQ DX, SI - ADDQ DI, 24(SP) - ADCQ $0x00, SI + // | - // | w29 @ 32(SP) - XORQ DI, DI - MOVQ 112(R14), AX - MULQ CX - ADDQ AX, 32(SP) - ADCQ DX, DI - ADDQ SI, 32(SP) - ADCQ $0x00, DI +/* modular reduction */ - // | w30 @ 40(SP) - XORQ SI, SI - MOVQ 120(R14), AX - MULQ CX - ADDQ AX, 40(SP) - ADCQ DX, SI - ADDQ DI, 40(SP) - ADCQ $0x00, SI + MOVQ R14, DX + SUBQ (R15), DX + MOVQ DX, (SP) + MOVQ 64(SP), DX + SBBQ 8(R15), DX + MOVQ DX, 8(SP) + MOVQ 48(SP), DX + SBBQ 16(R15), DX + MOVQ DX, 144(SP) + MOVQ 40(SP), DX + SBBQ 24(R15), DX + MOVQ DX, 152(SP) + MOVQ 32(SP), DX + SBBQ 32(R15), DX + MOVQ DX, 160(SP) + MOVQ 24(SP), DX + SBBQ 40(R15), DX + MOVQ DX, 168(SP) + MOVQ 16(SP), DX + SBBQ 48(R15), DX + MOVQ DX, 176(SP) + MOVQ R10, DX + 
SBBQ 56(R15), DX + MOVQ DX, 184(SP) + MOVQ DI, DX + SBBQ 64(R15), DX + MOVQ DX, 192(SP) + MOVQ CX, DX + SBBQ 72(R15), DX + MOVQ DX, 200(SP) + MOVQ R11, DX + SBBQ 80(R15), DX + MOVQ DX, 208(SP) + MOVQ R12, DX + SBBQ 88(R15), DX + MOVQ DX, 216(SP) + MOVQ R13, DX + SBBQ 96(R15), DX + MOVQ DX, 224(SP) + MOVQ BX, DX + SBBQ 104(R15), DX + MOVQ DX, 232(SP) + MOVQ SI, DX + SBBQ 112(R15), DX + MOVQ DX, 240(SP) + MOVQ R8, DX + SBBQ 120(R15), DX + MOVQ DX, 248(SP) + SBBQ $0x00, R9 - // | w31 @ 48(SP) - ADDQ SI, R15 - ADCQ R15, 48(SP) - MOVQ $0x0000000000000000, R15 - ADCQ $0x00, R15 + // | - // | Reduce by modulus - MOVQ 64(SP), CX - SUBQ (R14), CX - MOVQ R8, AX - SBBQ 8(R14), AX - MOVQ R9, DX - SBBQ 16(R14), DX - MOVQ R10, BX - SBBQ 24(R14), BX - MOVQ BX, 56(SP) - MOVQ R11, BX - SBBQ 32(R14), BX - MOVQ BX, 208(SP) - MOVQ R12, BX - SBBQ 40(R14), BX - MOVQ BX, 216(SP) - MOVQ R13, BX - SBBQ 48(R14), BX - MOVQ BX, 224(SP) - MOVQ 200(SP), BX - SBBQ 56(R14), BX - MOVQ BX, 232(SP) - MOVQ 192(SP), BX - SBBQ 64(R14), BX - MOVQ BX, 240(SP) - MOVQ (SP), BX - SBBQ 72(R14), BX - MOVQ BX, 248(SP) - MOVQ 8(SP), BX - SBBQ 80(R14), BX - MOVQ BX, 256(SP) - MOVQ 16(SP), BX - SBBQ 88(R14), BX - MOVQ BX, 264(SP) - MOVQ 24(SP), BX - SBBQ 96(R14), BX - MOVQ BX, 272(SP) - MOVQ 32(SP), BX - SBBQ 104(R14), BX - MOVQ BX, 280(SP) - MOVQ 40(SP), BX - SBBQ 112(R14), BX - MOVQ BX, 288(SP) - MOVQ 48(SP), BX - SBBQ 120(R14), BX - MOVQ BX, 296(SP) - SBBQ $0x00, R15 +/* out */ - // | Compare & Return - MOVQ c+0(FP), DI - CMOVQCS 64(SP), CX - MOVQ CX, (DI) - CMOVQCC AX, R8 - MOVQ R8, 8(DI) - CMOVQCC DX, R9 - MOVQ R9, 16(DI) - CMOVQCC 56(SP), R10 - MOVQ R10, 24(DI) + MOVQ c+0(FP), R9 + CMOVQCC (SP), R14 + MOVQ R14, (R9) + MOVQ 64(SP), DX + CMOVQCC 8(SP), DX + MOVQ DX, 8(R9) + MOVQ 48(SP), DX + CMOVQCC 144(SP), DX + MOVQ DX, 16(R9) + MOVQ 40(SP), DX + CMOVQCC 152(SP), DX + MOVQ DX, 24(R9) + MOVQ 32(SP), DX + CMOVQCC 160(SP), DX + MOVQ DX, 32(R9) + MOVQ 24(SP), DX + CMOVQCC 168(SP), DX + MOVQ DX, 40(R9) + MOVQ 16(SP), DX + CMOVQCC 176(SP), DX + MOVQ DX, 48(R9) + CMOVQCC 184(SP), R10 + MOVQ R10, 56(R9) + CMOVQCC 192(SP), DI + MOVQ DI, 64(R9) + CMOVQCC 200(SP), CX + MOVQ CX, 72(R9) CMOVQCC 208(SP), R11 - MOVQ R11, 32(DI) + MOVQ R11, 80(R9) CMOVQCC 216(SP), R12 - MOVQ R12, 40(DI) + MOVQ R12, 88(R9) CMOVQCC 224(SP), R13 - MOVQ R13, 48(DI) - MOVQ 200(SP), BX + MOVQ R13, 96(R9) CMOVQCC 232(SP), BX - MOVQ BX, 56(DI) - MOVQ 192(SP), BX - CMOVQCC 240(SP), BX - MOVQ BX, 64(DI) - MOVQ (SP), BX - CMOVQCC 248(SP), BX - MOVQ BX, 72(DI) - MOVQ 8(SP), BX - CMOVQCC 256(SP), BX - MOVQ BX, 80(DI) - MOVQ 16(SP), BX - CMOVQCC 264(SP), BX - MOVQ BX, 88(DI) - MOVQ 24(SP), BX - CMOVQCC 272(SP), BX - MOVQ BX, 96(DI) - MOVQ 32(SP), BX - CMOVQCC 280(SP), BX - MOVQ BX, 104(DI) - MOVQ 40(SP), BX - CMOVQCC 288(SP), BX - MOVQ BX, 112(DI) - MOVQ 48(SP), BX - CMOVQCC 296(SP), BX - MOVQ BX, 120(DI) + MOVQ BX, 104(R9) + CMOVQCC 240(SP), SI + MOVQ SI, 112(R9) + CMOVQCC 248(SP), R8 + MOVQ R8, 120(R9) RET + // | + +/* end */ + + +// func mul1(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64, inp uint64) +TEXT ·mul1(SB), NOSPLIT, $0-40 + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + +/* multiplication */ + + MOVQ (SI), DX + MULXQ (DI), R8, R9 + +/* montgommery reduction */ + + MOVQ p+24(FP), R15 + MOVQ R8, DX + MULXQ inp+32(FP), DX, DI + + MULXQ (R15), AX, DI + ADDQ AX, R8 + ADCQ DI, R9 + ADCQ $0x00, R8 + +/* modular reduction */ + + MOVQ R9, AX + SUBQ (R15), AX + SBBQ $0x00, R8 + +/* out */ + MOVQ c+0(FP), DI + CMOVQCC AX, R9 + MOVQ R9, (DI) + RET /* end */ 
+// func mul_no_adx_bmi2_1(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64, inp uint64) +TEXT ·mul_no_adx_bmi2_1(SB), NOSPLIT, $0-40 + +/* inputs */ + + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + // | + +/* multiplication */ + + MOVQ (SI), CX + MOVQ (DI), AX + MULQ CX + MOVQ AX, R8 + MOVQ DX, R9 + +/* montgommery reduction */ + + MOVQ p+24(FP), R15 + + MOVQ R8, AX + MULQ inp+32(FP) + MOVQ AX, CX + + MOVQ (R15), AX + MULQ CX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R8 + +/* modular reduction */ + + MOVQ R9, AX + SUBQ (R15), AX + SBBQ $0x00, R8 + +/* out */ + + MOVQ c+0(FP), DI + CMOVQCC AX, R9 + MOVQ R9, (DI) + RET + +/* end */ + +TEXT ·is_even(SB), NOSPLIT, $0-9 + MOVQ a+0(FP), DI + MOVB $0x00, ret+8(FP) + MOVQ 0(DI), AX + TESTQ $1, AX + JNZ ret + MOVB $0x01, ret+8(FP) +ret: + RET diff --git a/x86_is_even.s b/x86_is_even.s deleted file mode 100644 index da3f1e5..0000000 --- a/x86_is_even.s +++ /dev/null @@ -1,11 +0,0 @@ -#include "textflag.h" - -TEXT ·is_even(SB), NOSPLIT, $0-9 - MOVQ a+0(FP), DI - MOVB $0x00, ret+8(FP) - MOVQ 0(DI), AX - TESTQ $1, AX - JNZ ret - MOVB $0x01, ret+8(FP) -ret: - RET diff --git a/x86_single_limb_arithmetic.s b/x86_single_limb_arithmetic.s deleted file mode 100644 index 3bfd5e7..0000000 --- a/x86_single_limb_arithmetic.s +++ /dev/null @@ -1,212 +0,0 @@ -#include "textflag.h" - -// func mul_two_1(a *[1]uint64) -TEXT ·mul_two_1(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCLQ $0x01, (DI) - RET - -// func div_two_1(a *[1]uint64) -TEXT ·div_two_1(SB), NOSPLIT, $0-8 - MOVQ a+0(FP), DI - XORQ AX, AX - RCRQ $0x01, (DI) - RET - -// func cpy(dst *[1]uint64, src *[1]uint64) -TEXT ·cpy1(SB), NOSPLIT, $0-16 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ (SI), R8 - MOVQ R8, (DI) - RET - -// func eq(a *[1]uint64, b *[1]uint64) bool -TEXT ·eq1(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVB $0x00, ret+16(FP) - MOVQ (DI), R8 - CMPQ (SI), R8 - JNE ret - MOVB $0x01, ret+16(FP) - -ret: - RET - -// func cmp(a *[1]uint64, b *[1]uint64) int8 -TEXT ·cmp1(SB), NOSPLIT, $0-17 - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - MOVQ (DI), R8 - CMPQ (SI), R8 - JB gt - JA lt - MOVB $0x00, ret+16(FP) - JMP ret - -gt: - MOVB $0x01, ret+16(FP) - JMP ret - -lt: - MOVB $0xff, ret+16(FP) - -ret: - RET - -// func add(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64) -TEXT ·add1(SB), NOSPLIT, $0-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - - // | - MOVQ (DI), CX - ADDQ (SI), CX - ADCQ $0x00, AX - - // | - MOVQ p+24(FP), SI - MOVQ CX, DX - SUBQ (SI), DX - SBBQ $0x00, AX - - // | - MOVQ c+0(FP), DI - CMOVQCC DX, CX - MOVQ CX, (DI) - RET - -// func addn(a *[1]uint64, b *[1]uint64) uint64 -TEXT ·addn1(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - - // | - MOVQ (DI), CX - ADDQ (SI), CX - ADCQ $0x00, AX - - // | - MOVQ CX, (DI) - MOVQ AX, ret+16(FP) - RET - -// func double(c *[1]uint64, a *[1]uint64, p *[1]uint64) -TEXT ·double1(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI - XORQ AX, AX - MOVQ (DI), CX - ADDQ CX, CX - ADCQ $0x00, AX - - // | - MOVQ p+16(FP), SI - MOVQ CX, DX - SUBQ (SI), DX - SBBQ $0x00, AX - - // | - MOVQ c+0(FP), DI - CMOVQCC DX, CX - MOVQ CX, (DI) - RET - -// func sub(c *[1]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64) -TEXT ·sub1(SB), NOSPLIT, $0-32 - // | - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - XORQ AX, AX - MOVQ (DI), CX - SUBQ (SI), CX - - // | - MOVQ p+24(FP), SI - MOVQ (SI), DX - CMOVQCC AX, DX - - // | - MOVQ c+0(FP), DI - ADDQ DX, CX - MOVQ CX, (DI) - RET - -// func subn(a *[1]uint64, b *[1]uint64) 
uint64 -TEXT ·subn1(SB), NOSPLIT, $0-24 - // | - MOVQ a+0(FP), DI - MOVQ b+8(FP), SI - XORQ AX, AX - - // | - MOVQ (DI), CX - SUBQ (SI), CX - ADCQ $0x00, AX - - // | - MOVQ CX, (DI) - MOVQ AX, ret+16(FP) - RET - -// func _neg(c *[1]uint64, a *[1]uint64, p *[1]uint64) -TEXT ·_neg1(SB), NOSPLIT, $0-24 - // | - MOVQ a+8(FP), DI - - // | - MOVQ p+16(FP), SI - MOVQ (SI), CX - SUBQ (DI), CX - - // | - MOVQ c+0(FP), DI - MOVQ CX, (DI) - RET - -// func mul(c *[2]uint64, a *[1]uint64, b *[1]uint64, p *[1]uint64, inp uint64) -TEXT ·mul1(SB), NOSPLIT, $0-40 - // | - -/* inputs */ - - MOVQ a+8(FP), DI - MOVQ b+16(FP), SI - - // | - MOVQ (SI), DX - MULXQ (DI), R8, R9 - -/* swap */ - - MOVQ p+24(FP), R15 - - // | - MOVQ R8, DX - MULXQ inp+32(FP), DX, DI - - MULXQ (R15), AX, DI - ADDQ AX, R8 - ADCQ DI, R9 - ADCQ $0x00, R8 - -/* reduction */ - - MOVQ R9, AX - SUBQ (R15), AX - SBBQ $0x00, R8 - - // | - MOVQ c+0(FP), DI - CMOVQCC AX, R9 - MOVQ R9, (DI) - RET - - // | - -/* end */ \ No newline at end of file
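The mul1 and mul_no_adx_bmi2_1 routines added above both carry out single-limb Montgomery multiplication: one 64x64 multiplication, one reduction step driven by inp = -p^-1 mod 2^64, and a final conditional subtraction of the modulus; for this one-limb case they differ only in using MULX (BMI2) versus the legacy MUL instruction. As a rough reference for what that assembly computes, here is a plain-Go sketch of the same one-limb operation built on math/bits. The helper names (negInv64, mont1) and the example modulus are illustrative only and are not part of the package.

package main

import (
	"fmt"
	"math/bits"
)

// negInv64 returns -p^-1 mod 2^64 for an odd modulus p, computed with
// Newton iteration. It plays the role of the precomputed inp parameter
// that the assembly routines receive; the name is illustrative only.
func negInv64(p uint64) uint64 {
	inv := p // correct to 3 bits for odd p
	for i := 0; i < 5; i++ {
		inv *= 2 - p*inv // each round doubles the number of correct bits
	}
	return -inv
}

// mont1 mirrors the structure of the one-limb routines above: a full
// 64x64 multiplication, a single Montgomery reduction step, and a final
// conditional subtraction of the modulus. For a, b < p it returns
// a*b*2^-64 mod p.
func mont1(a, b, p, inp uint64) uint64 {
	hi, lo := bits.Mul64(a, b)      // t = a*b as the 128-bit pair hi:lo
	m := lo * inp                   // m = (t mod 2^64) * (-p^-1) mod 2^64
	mpHi, mpLo := bits.Mul64(m, p)  // m*p
	_, c := bits.Add64(lo, mpLo, 0) // the low limb cancels to zero; keep the carry
	r, c2 := bits.Add64(hi, mpHi, c)
	if c2 != 0 || r >= p { // (t + m*p)/2^64 < 2p, so one subtraction suffices
		r -= p
	}
	return r
}

func main() {
	p := uint64(0xffffffff00000001) // an arbitrary odd modulus, for illustration only
	inp := negInv64(p)
	// Treating 2 and 3 as values already in Montgomery form, the result is
	// the Montgomery form of their product.
	fmt.Printf("inp = %#x  mont1(2, 3) = %#x\n", inp, mont1(2, 3, p, inp))
}

A caller would normally precompute inp once per field, as the inp uint64 argument of the assembly routines suggests, and keep operands in Montgomery form between multiplications.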