crypto/internal/bigmod: switch to saturated limbs

Turns out that unsaturated limbs being more performant for Montgomery
multiplication was true in portable C89, but is now a misconception.
With add-with-carry instructions, it's possible to run the carry chain
across the limbs, instead of needing the limb-by-limb product to fit in
two words.

Switch to saturated limbs, and import the same Montgomery loop as
math/big, along with its assembly for some architectures. Since here we
know the sizes we care about, we can drop most of the assembly
scaffolding. For amd64, ported to avo, too.

We recover all the Go 1.20 performance loss on private key operations on
both Intel Xeon and AMD EPYC, with even a 10% improvement over Go 1.19
(which used variable-time math/big) for some operations.

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
                        │  go1.19.txt  │        go1.20.txt        │         new.txt          │
                        │    sec/op    │    sec/op       vs base  │    sec/op       vs base  │
DecryptPKCS1v15/2048-4    1.175m ± 0%    1.515m ± 0%    +28.95%    1.132m ± 0%     -3.59%
DecryptPKCS1v15/3072-4    3.428m ± 1%    4.516m ± 0%    +31.75%    3.198m ± 0%     -6.69%
DecryptPKCS1v15/4096-4    7.405m ± 0%   10.092m ± 0%    +36.29%    6.446m ± 0%    -12.95%
EncryptPKCS1v15/2048-4    7.426µ ± 0%  170.829µ ± 0%  +2200.57%  131.874µ ± 0%  +1675.97%
DecryptOAEP/2048-4        1.175m ± 0%    1.524m ± 0%    +29.68%    1.137m ± 0%     -3.26%
EncryptOAEP/2048-4        9.609µ ± 0%  173.008µ ± 0%  +1700.48%  132.344µ ± 0%  +1277.29%
SignPKCS1v15/2048-4       1.181m ± 0%    1.563m ± 0%    +32.34%    1.177m ± 0%     -0.37%
VerifyPKCS1v15/2048-4     6.452µ ± 0%  170.092µ ± 0%  +2536.06%  131.225µ ± 0%  +1933.70%
SignPSS/2048-4            1.184m ± 0%    1.574m ± 0%    +32.88%    1.175m ± 0%     -0.84%
VerifyPSS/2048-4          9.151µ ± 1%  172.909µ ± 0%  +1789.50%  132.391µ ± 0%  +1346.74%

                        │  go1.19.txt  │        go1.20.txt        │         new.txt          │
                        │     B/op     │     B/op        vs base  │     B/op        vs base  │
DecryptPKCS1v15/2048-4   24266.5 ± 0%     640.0 ± 0%    -97.36%     640.0 ± 0%    -97.36%
DecryptPKCS1v15/3072-4  45.465Ki ± 0%   3.375Ki ± 0%    -92.58%   4.688Ki ± 0%    -89.69%
DecryptPKCS1v15/4096-4  61.080Ki ± 0%   4.625Ki ± 0%    -92.43%   6.250Ki ± 0%    -89.77%
EncryptPKCS1v15/2048-4   3.138Ki ± 0%   1.146Ki ± 0%    -63.49%   1.082Ki ± 0%    -65.52%
DecryptOAEP/2048-4       24500.0 ± 0%     872.0 ± 0%    -96.44%     872.0 ± 0%    -96.44%
EncryptOAEP/2048-4       3.610Ki ± 0%   1.371Ki ± 0%    -62.02%   1.308Ki ± 0%    -63.78%
SignPKCS1v15/2048-4      26933.0 ± 0%     896.0 ± 0%    -96.67%     896.0 ± 0%    -96.67%
VerifyPKCS1v15/2048-4     3209.0 ± 0%     912.0 ± 0%    -71.58%     848.0 ± 0%    -73.57%
SignPSS/2048-4          26.940Ki ± 0%   1.266Ki ± 0%    -95.30%   1.266Ki ± 0%    -95.30%
VerifyPSS/2048-4         3.337Ki ± 0%   1.094Ki ± 0%    -67.22%   1.031Ki ± 0%    -69.10%

                        │  go1.19.txt  │        go1.20.txt        │         new.txt          │
                        │  allocs/op   │  allocs/op      vs base  │  allocs/op      vs base  │
DecryptPKCS1v15/2048-4    97.000 ± 0%    4.000 ± 0%     -95.88%    4.000 ± 0%     -95.88%
DecryptPKCS1v15/3072-4    107.00 ± 0%    10.00 ± 0%     -90.65%    12.00 ± 0%     -88.79%
DecryptPKCS1v15/4096-4    113.00 ± 0%    10.00 ± 0%     -91.15%    12.00 ± 0%     -89.38%
EncryptPKCS1v15/2048-4     7.000 ± 0%    7.000 ± 0%           ~    7.000 ± 0%           ~
DecryptOAEP/2048-4        103.00 ± 0%    10.00 ± 0%     -90.29%    10.00 ± 0%     -90.29%
EncryptOAEP/2048-4         14.00 ± 0%    13.00 ± 0%      -7.14%    13.00 ± 0%      -7.14%
SignPKCS1v15/2048-4      102.000 ± 0%    5.000 ± 0%     -95.10%    5.000 ± 0%     -95.10%
VerifyPKCS1v15/2048-4      7.000 ± 0%    6.000 ± 0%     -14.29%    6.000 ± 0%     -14.29%
SignPSS/2048-4            108.00 ± 0%    10.00 ± 0%     -90.74%    10.00 ± 0%     -90.74%
VerifyPSS/2048-4           12.00 ± 0%    11.00 ± 0%      -8.33%    11.00 ± 0%      -8.33%

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: AMD EPYC 7R13 Processor
                        │ go1.19a.txt  │       go1.20a.txt        │         newa.txt         │
                        │    sec/op    │    sec/op       vs base  │    sec/op       vs base  │
DecryptPKCS1v15/2048-4    970.0µ ± 0%   1667.6µ ± 0%    +71.92%    951.6µ ± 0%     -1.90%
DecryptPKCS1v15/3072-4    2.949m ± 0%    5.124m ± 0%    +73.75%    2.675m ± 0%     -9.29%
DecryptPKCS1v15/4096-4    6.350m ± 0%   11.660m ± 0%    +83.62%    5.746m ± 0%     -9.51%
EncryptPKCS1v15/2048-4    6.605µ ± 1%  183.807µ ± 0%  +2683.05%  123.720µ ± 0%  +1773.27%
DecryptOAEP/2048-4        973.8µ ± 0%   1670.8µ ± 0%    +71.57%    951.8µ ± 0%     -2.27%
EncryptOAEP/2048-4        8.444µ ± 1%  185.889µ ± 0%  +2101.56%  124.142µ ± 0%  +1370.27%
SignPKCS1v15/2048-4       976.8µ ± 0%   1725.5µ ± 0%    +76.65%    979.6µ ± 0%     +0.28%
VerifyPKCS1v15/2048-4     5.713µ ± 0%  182.983µ ± 0%  +3103.19%  122.737µ ± 0%  +2048.56%
SignPSS/2048-4            980.3µ ± 0%   1729.5µ ± 0%    +76.42%    985.7µ ± 3%     +0.55%
VerifyPSS/2048-4          8.168µ ± 1%  185.312µ ± 0%  +2168.76%  123.772µ ± 0%  +1415.33%

Fixes #59463
Fixes #59442
Updates #57752

Change-Id: I311a9c1f4f5288e47e53ca14f615a443f3132734
Reviewed-on: https://go-review.googlesource.com/c/go/+/471259
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Run-TryBot: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
golang · May 24, 2023 · 7d96475 · 7d96475 · emmansun · Jun 2, 2023
1 parent 1ddab59
commit 7d96475
Show file tree

Hide file tree

Showing 12 changed files with 1,967 additions and 413 deletions.
diff --git a/src/crypto/internal/bigmod/_asm/nat_amd64_asm.go b/src/crypto/internal/bigmod/_asm/nat_amd64_asm.go
@@ -1,131 +1,113 @@
-// Copyright 2022 The Go Authors. All rights reserved.
+// Copyright 2023 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package main
 
 import (
+	"strconv"
+
 	. "github.com/mmcloughlin/avo/build"
 	. "github.com/mmcloughlin/avo/operand"
 	. "github.com/mmcloughlin/avo/reg"
 )
 
-//go:generate go run . -out ../nat_amd64.s -stubs ../nat_amd64.go -pkg bigmod
+//go:generate go run . -out ../nat_amd64.s -pkg bigmod
 
+// main drives the avo generator for package crypto/internal/bigmod: it sets
+// the build constraint, emits one addMulVVW routine for each of the 1024-,
+// 1536-, and 2048-bit sizes, and then calls Generate.
 func main() {
 	Package("crypto/internal/bigmod")
-	ConstraintExpr("amd64,gc,!purego")
-
-	Implement("montgomeryLoop")
-	Pragma("noescape")
-
-	size := Load(Param("d").Len(), GP64())
-	d := Mem{Base: Load(Param("d").Base(), GP64())}
-	b := Mem{Base: Load(Param("b").Base(), GP64())}
-	m := Mem{Base: Load(Param("m").Base(), GP64())}
-	m0inv := Load(Param("m0inv"), GP64())
-
-	overflow := zero()
-	i := zero()
-	Label("outerLoop")
-
-	ai := Load(Param("a").Base(), GP64())
-	MOVQ(Mem{Base: ai}.Idx(i, 8), ai)
-
-	z := uint128{GP64(), GP64()}
-	mul64(z, b, ai)
-	add64(z, d)
-	f := GP64()
-	MOVQ(m0inv, f)
-	IMULQ(z.lo, f)
-	_MASK(f)
-	addMul64(z, m, f)
-	carry := shiftBy63(z)
-
-	j := zero()
-	INCQ(j)
-	JMP(LabelRef("innerLoopCondition"))
-	Label("innerLoop")
-
-	// z = d[j] + a[i] * b[j] + f * m[j] + carry
-	z = uint128{GP64(), GP64()}
-	mul64(z, b.Idx(j, 8), ai)
-	addMul64(z, m.Idx(j, 8), f)
-	add64(z, d.Idx(j, 8))
-	add64(z, carry)
-	// d[j-1] = z_lo & _MASK
-	storeMasked(z.lo, d.Idx(j, 8).Offset(-8))
-	// carry = z_hi<<1 | z_lo>>_W
-	MOVQ(shiftBy63(z), carry)
-
-	INCQ(j)
-	Label("innerLoopCondition")
-	CMPQ(size, j)
-	JGT(LabelRef("innerLoop"))
-
-	ADDQ(carry, overflow)
-	storeMasked(overflow, d.Idx(size, 8).Offset(-8))
-	SHRQ(Imm(63), overflow)
-
-	INCQ(i)
-	CMPQ(size, i)
-	JGT(LabelRef("outerLoop"))
-
-	Store(overflow, ReturnIndex(0))
-	RET()
-	Generate()
-}
+	ConstraintExpr("!purego")
 
-// zero zeroes a new register and returns it.
-func zero() Register {
-	r := GP64()
-	XORQ(r, r)
-	return r
-}
-
-// _MASK masks out the top bit of r.
-func _MASK(r Register) {
-	BTRQ(Imm(63), r)
-}
-
-type uint128 struct {
-	hi, lo GPVirtual
-}
+	// Each call emits a separate routine, named after its size in bits.
+	addMulVVW(1024)
+	addMulVVW(1536)
+	addMulVVW(2048)
 
-// storeMasked stores _MASK(src) in dst. It doesn't modify src.
-func storeMasked(src, dst Op) {
-	out := GP64()
-	MOVQ(src, out)
-	_MASK(out)
-	MOVQ(out, dst)
-}
-
-// shiftBy63 returns z >> 63. It reuses z.lo.
-func shiftBy63(z uint128) Register {
-	SHRQ(Imm(63), z.hi, z.lo)
-	result := z.lo
-	z.hi, z.lo = nil, nil
-	return result
-}
-
-// add64 sets r to r + a.
-func add64(r uint128, a Op) {
-	ADDQ(a, r.lo)
-	ADCQ(Imm(0), r.hi)
+	Generate()
 }
 
-// mul64 sets r to a * b.
-func mul64(r uint128, a, b Op) {
-	MOVQ(a, RAX)
-	MULQ(b) // RDX, RAX = RAX * b
-	MOVQ(RAX, r.lo)
-	MOVQ(RDX, r.hi)
-}
+// addMulVVW emits the function addMulVVW<bits>. The emitted function
+// multiplies the bits/64 limbs at x by the word y, adds the products into the
+// limbs at z in place, and returns the final carry word.
+//
+// Two fully unrolled code paths are emitted: one using MULQ/ADDQ/ADCQ, and one
+// using MULXQ/ADCXQ/ADOXQ that is taken when the ·supportADX byte is 1.
+//
+// bits must be a multiple of 128; addMulVVW panics otherwise.
+func addMulVVW(bits int) {
+	// Whole 64-bit limbs would be enough for the MULQ path, but the ADX path
+	// below is unrolled two limbs at a time. With an odd limb count it would
+	// emit an extra iteration that reads x and updates z one limb past the end.
+	if bits%128 != 0 {
+		panic("bit size unsupported")
+	}
+
+	Implement("addMulVVW" + strconv.Itoa(bits))
+
+	// Dispatch at run time: jump to the ADX path when supportADX is set.
+	CMPB(Mem{Symbol: Symbol{Name: "·supportADX"}, Base: StaticBase}, Imm(1))
+	JEQ(LabelRef("adx"))
+
+	z := Mem{Base: Load(Param("z"), GP64())}
+	x := Mem{Base: Load(Param("x"), GP64())}
+	y := Load(Param("y"), GP64())
+
+	carry := GP64()
+	XORQ(carry, carry) // zero out carry
+
+	for i := 0; i < bits/64; i++ {
+		Comment("Iteration " + strconv.Itoa(i))
+		hi, lo := RDX, RAX // implicit MULQ inputs and outputs
+		MOVQ(x.Offset(i*8), lo)
+		// hi:lo = x[i] * y
+		MULQ(y)
+		// hi:lo += z[i]
+		ADDQ(z.Offset(i*8), lo)
+		ADCQ(Imm(0), hi)
+		// hi:lo += carry from the previous limb
+		ADDQ(carry, lo)
+		ADCQ(Imm(0), hi)
+		// The high word becomes the next carry; the low word is the new z[i].
+		MOVQ(hi, carry)
+		MOVQ(lo, z.Offset(i*8))
+	}
+
+	Store(carry, ReturnIndex(0))
+	RET()
 
-// addMul64 sets r to r + a * b.
-func addMul64(r uint128, a, b Op) {
-	MOVQ(a, RAX)
-	MULQ(b) // RDX, RAX = RAX * b
-	ADDQ(RAX, r.lo)
-	ADCQ(RDX, r.hi)
+	Label("adx")
+
+	// The ADX strategy implements the following function, where c1 and c2 are
+	// the overflow and the carry flag respectively.
+	//
+	//    func addMulVVW(z, x []uint, y uint) (carry uint) {
+	//        var c1, c2 uint
+	//        for i := range z {
+	//            hi, lo := bits.Mul(x[i], y)
+	//            lo, c1 = bits.Add(lo, z[i], c1)
+	//            z[i], c2 = bits.Add(lo, carry, c2)
+	//            carry = hi
+	//        }
+	//        return carry + c1 + c2
+	//    }
+	//
+	// The loop is fully unrolled and the hi / carry registers are alternated
+	// instead of introducing a MOV.
+
+	// The jump above happens before any parameter is loaded, so this path
+	// loads them itself.
+	z = Mem{Base: Load(Param("z"), GP64())}
+	x = Mem{Base: Load(Param("x"), GP64())}
+	Load(Param("y"), RDX) // implicit source of MULXQ
+
+	carry = GP64()
+	XORQ(carry, carry) // zero out carry
+	z0 := GP64()
+	XORQ(z0, z0) // unset flags and zero out z0
+
+	// Each pass through this loop emits two limbs, which is why the limb
+	// count must be even.
+	for i := 0; i < bits/64; i++ {
+		hi, lo := GP64(), GP64()
+
+		Comment("Iteration " + strconv.Itoa(i))
+		MULXQ(x.Offset(i*8), lo, hi)
+		ADCXQ(carry, lo)
+		ADOXQ(z.Offset(i*8), lo)
+		MOVQ(lo, z.Offset(i*8))
+
+		i++
+
+		// Second limb of the pair: hi now holds the incoming carry and carry
+		// receives the new high word.
+		Comment("Iteration " + strconv.Itoa(i))
+		MULXQ(x.Offset(i*8), lo, carry)
+		ADCXQ(hi, lo)
+		ADOXQ(z.Offset(i*8), lo)
+		MOVQ(lo, z.Offset(i*8))
+	}
+
+	Comment("Add back carry flags and return")
+	ADCXQ(z0, carry)
+	ADOXQ(z0, carry)
+
+	Store(carry, ReturnIndex(0))
+	RET()
 }