Skip to content

Commit

Permalink
crypto/internal/bigmod: switch to saturated limbs
Browse files Browse the repository at this point in the history
Turns out that unsaturated limbs being more performant for Montgomery
multiplication was true in portable C89, but is now a misconception.
With add-with-carry instructions, it's possible to run the carry chain
across the limbs, instead of needing the limb-by-limb product to fit in
two words.

Switch to saturated limbs, and import the same Montgomery loop as
math/big, along with its assembly for some architectures. Since here we
know the sizes we care about, we can drop most of the assembly
scaffolding. For amd64, ported to avo, too.

We recover all the Go 1.20 performance loss on private key operations on
both Intel Xeon and AMD EPYC, with even a 10% improvement over Go 1.19
(which used variable-time math/big) for some operations.

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
                       │ go1.19.txt  │       go1.20.txt         │         new.txt          │
                       │   sec/op    │    sec/op      vs base   │    sec/op      vs base   │
DecryptPKCS1v15/2048-4   1.175m ± 0%     1.515m ± 0%    +28.95%     1.132m ± 0%     -3.59%
DecryptPKCS1v15/3072-4   3.428m ± 1%     4.516m ± 0%    +31.75%     3.198m ± 0%     -6.69%
DecryptPKCS1v15/4096-4   7.405m ± 0%    10.092m ± 0%    +36.29%     6.446m ± 0%    -12.95%
EncryptPKCS1v15/2048-4   7.426µ ± 0%   170.829µ ± 0%  +2200.57%   131.874µ ± 0%  +1675.97%
DecryptOAEP/2048-4       1.175m ± 0%     1.524m ± 0%    +29.68%     1.137m ± 0%     -3.26%
EncryptOAEP/2048-4       9.609µ ± 0%   173.008µ ± 0%  +1700.48%   132.344µ ± 0%  +1277.29%
SignPKCS1v15/2048-4      1.181m ± 0%     1.563m ± 0%    +32.34%     1.177m ± 0%     -0.37% 
VerifyPKCS1v15/2048-4    6.452µ ± 0%   170.092µ ± 0%  +2536.06%   131.225µ ± 0%  +1933.70%
SignPSS/2048-4           1.184m ± 0%     1.574m ± 0%    +32.88%     1.175m ± 0%     -0.84%
VerifyPSS/2048-4         9.151µ ± 1%   172.909µ ± 0%  +1789.50%   132.391µ ± 0%  +1346.74%

                       │  go1.19.txt   │      go1.20.txt       │       new.txt         │
                       │     B/op      │     B/op      vs base │     B/op      vs base │
DecryptPKCS1v15/2048-4    24266.5 ± 0%     640.0 ± 0%  -97.36%     640.0 ± 0%  -97.36%
DecryptPKCS1v15/3072-4   45.465Ki ± 0%   3.375Ki ± 0%  -92.58%   4.688Ki ± 0%  -89.69%
DecryptPKCS1v15/4096-4   61.080Ki ± 0%   4.625Ki ± 0%  -92.43%   6.250Ki ± 0%  -89.77%
EncryptPKCS1v15/2048-4    3.138Ki ± 0%   1.146Ki ± 0%  -63.49%   1.082Ki ± 0%  -65.52%
DecryptOAEP/2048-4        24500.0 ± 0%     872.0 ± 0%  -96.44%     872.0 ± 0%  -96.44%
EncryptOAEP/2048-4        3.610Ki ± 0%   1.371Ki ± 0%  -62.02%   1.308Ki ± 0%  -63.78%
SignPKCS1v15/2048-4       26933.0 ± 0%     896.0 ± 0%  -96.67%     896.0 ± 0%  -96.67%
VerifyPKCS1v15/2048-4      3209.0 ± 0%     912.0 ± 0%  -71.58%     848.0 ± 0%  -73.57%
SignPSS/2048-4           26.940Ki ± 0%   1.266Ki ± 0%  -95.30%   1.266Ki ± 0%  -95.30%
VerifyPSS/2048-4          3.337Ki ± 0%   1.094Ki ± 0%  -67.22%   1.031Ki ± 0%  -69.10%

                       │  go1.19.txt  │     go1.20.txt      │      new.txt          │
                       │  allocs/op   │ allocs/op   vs base │ allocs/op   vs base   │
DecryptPKCS1v15/2048-4    97.000 ± 0%   4.000 ± 0%  -95.88%     4.000 ± 0%  -95.88%
DecryptPKCS1v15/3072-4    107.00 ± 0%   10.00 ± 0%  -90.65%     12.00 ± 0%  -88.79%
DecryptPKCS1v15/4096-4    113.00 ± 0%   10.00 ± 0%  -91.15%     12.00 ± 0%  -89.38%
EncryptPKCS1v15/2048-4     7.000 ± 0%   7.000 ± 0%        ~     7.000 ± 0%        ~  
DecryptOAEP/2048-4        103.00 ± 0%   10.00 ± 0%  -90.29%     10.00 ± 0%  -90.29%
EncryptOAEP/2048-4         14.00 ± 0%   13.00 ± 0%   -7.14%     13.00 ± 0%   -7.14%
SignPKCS1v15/2048-4      102.000 ± 0%   5.000 ± 0%  -95.10%     5.000 ± 0%  -95.10%
VerifyPKCS1v15/2048-4      7.000 ± 0%   6.000 ± 0%  -14.29%     6.000 ± 0%  -14.29%
SignPSS/2048-4            108.00 ± 0%   10.00 ± 0%  -90.74%     10.00 ± 0%  -90.74%
VerifyPSS/2048-4           12.00 ± 0%   11.00 ± 0%   -8.33%     11.00 ± 0%   -8.33%

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: AMD EPYC 7R13 Processor
                       │ go1.19a.txt │       go1.20a.txt        │        newa.txt          │
                       │   sec/op    │    sec/op      vs base   │    sec/op      vs base   │
DecryptPKCS1v15/2048-4   970.0µ ± 0%    1667.6µ ± 0%    +71.92%     951.6µ ± 0%     -1.90%
DecryptPKCS1v15/3072-4   2.949m ± 0%     5.124m ± 0%    +73.75%     2.675m ± 0%     -9.29%
DecryptPKCS1v15/4096-4   6.350m ± 0%    11.660m ± 0%    +83.62%     5.746m ± 0%     -9.51%
EncryptPKCS1v15/2048-4   6.605µ ± 1%   183.807µ ± 0%  +2683.05%   123.720µ ± 0%  +1773.27%
DecryptOAEP/2048-4       973.8µ ± 0%    1670.8µ ± 0%    +71.57%     951.8µ ± 0%     -2.27% 
EncryptOAEP/2048-4       8.444µ ± 1%   185.889µ ± 0%  +2101.56%   124.142µ ± 0%  +1370.27%
SignPKCS1v15/2048-4      976.8µ ± 0%    1725.5µ ± 0%    +76.65%     979.6µ ± 0%     +0.28%
VerifyPKCS1v15/2048-4    5.713µ ± 0%   182.983µ ± 0%  +3103.19%   122.737µ ± 0%  +2048.56%
SignPSS/2048-4           980.3µ ± 0%    1729.5µ ± 0%    +76.42%     985.7µ ± 3%     +0.55%
VerifyPSS/2048-4         8.168µ ± 1%   185.312µ ± 0%  +2168.76%   123.772µ ± 0%  +1415.33%

Fixes #59463
Fixes #59442
Updates #57752

Change-Id: I311a9c1f4f5288e47e53ca14f615a443f3132734
Reviewed-on: https://go-review.googlesource.com/c/go/+/471259
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Run-TryBot: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
  • Loading branch information
FiloSottile authored and gopherbot committed May 24, 2023
1 parent 1ddab59 commit 7d96475
Show file tree
Hide file tree
Showing 12 changed files with 1,967 additions and 413 deletions.
204 changes: 93 additions & 111 deletions src/crypto/internal/bigmod/_asm/nat_amd64_asm.go
Original file line number Diff line number Diff line change
@@ -1,131 +1,113 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
"strconv"

. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)

//go:generate go run . -out ../nat_amd64.s -stubs ../nat_amd64.go -pkg bigmod
//go:generate go run . -out ../nat_amd64.s -pkg bigmod

// main emits the amd64 assembly for montgomeryLoop using avo. The generated
// routine has an outer loop over i (one pass per word of a) and an inner loop
// over j that accumulates a[i]*b[j] + f*m[j] into d, storing every word with
// its top bit masked off and returning the final overflow bit.
func main() {
Package("crypto/internal/bigmod")
ConstraintExpr("amd64,gc,!purego")

Implement("montgomeryLoop")
Pragma("noescape")

// size is len(d); it is the bound used by both loops below.
size := Load(Param("d").Len(), GP64())
d := Mem{Base: Load(Param("d").Base(), GP64())}
b := Mem{Base: Load(Param("b").Base(), GP64())}
m := Mem{Base: Load(Param("m").Base(), GP64())}
m0inv := Load(Param("m0inv"), GP64())

overflow := zero()
i := zero()
Label("outerLoop")

// ai = a[i]: load the base pointer of a, then overwrite the same register
// with the word at index i.
ai := Load(Param("a").Base(), GP64())
MOVQ(Mem{Base: ai}.Idx(i, 8), ai)

// First word (j = 0): z = d[0] + a[i] * b[0], then f = _MASK(m0inv * z_lo),
// then z += f * m[0], and the carry into the next word is z >> 63.
z := uint128{GP64(), GP64()}
mul64(z, b, ai)
add64(z, d)
f := GP64()
MOVQ(m0inv, f)
IMULQ(z.lo, f)
_MASK(f)
addMul64(z, m, f)
carry := shiftBy63(z)

// j starts at 1 and control jumps straight to the loop condition, so the
// inner body is skipped entirely when size is not greater than 1.
j := zero()
INCQ(j)
JMP(LabelRef("innerLoopCondition"))
Label("innerLoop")

// z = d[j] + a[i] * b[j] + f * m[j] + carry
z = uint128{GP64(), GP64()}
mul64(z, b.Idx(j, 8), ai)
addMul64(z, m.Idx(j, 8), f)
add64(z, d.Idx(j, 8))
add64(z, carry)
// d[j-1] = z_lo & _MASK
storeMasked(z.lo, d.Idx(j, 8).Offset(-8))
// carry = z_hi<<1 | z_lo>>_W
MOVQ(shiftBy63(z), carry)

INCQ(j)
Label("innerLoopCondition")
// Branch back into the inner loop while size > j.
CMPQ(size, j)
JGT(LabelRef("innerLoop"))

// Top word: fold the last carry into overflow, store it masked into
// d[size-1], then keep only bit 63 as the overflow carried forward.
ADDQ(carry, overflow)
storeMasked(overflow, d.Idx(size, 8).Offset(-8))
SHRQ(Imm(63), overflow)

// Advance to the next word of a while size > i.
INCQ(i)
CMPQ(size, i)
JGT(LabelRef("outerLoop"))

// The remaining overflow bit is the function's single return value.
Store(overflow, ReturnIndex(0))
RET()
Generate()
}
ConstraintExpr("!purego")

// zero allocates a fresh 64-bit register, clears it by XORing it with
// itself, and hands it back to the caller.
func zero() Register {
	cleared := GP64()
	XORQ(cleared, cleared)
	return cleared
}

// _MASK masks out the top bit of r.
// It emits a single BTRQ on bit index 63 and modifies r in place.
func _MASK(r Register) {
BTRQ(Imm(63), r)
}

// uint128 pairs two virtual general-purpose registers that hold the high and
// low words of a double-width value.
type uint128 struct {
hi, lo GPVirtual
}
addMulVVW(1024)
addMulVVW(1536)
addMulVVW(2048)

// storeMasked writes src with its top bit cleared to dst. The masking is
// done on a scratch copy, so src itself is left untouched.
func storeMasked(src, dst Op) {
	scratch := GP64()
	MOVQ(src, scratch)
	_MASK(scratch)
	MOVQ(scratch, dst)
}

// shiftBy63 computes z >> 63 and returns the register holding it. The result
// is produced in place in z.lo, so no new register is allocated.
func shiftBy63(z uint128) Register {
	SHRQ(Imm(63), z.hi, z.lo)
	shifted := z.lo
	// Drop the local references to both halves once the shift has been emitted.
	z.hi, z.lo = nil, nil
	return shifted
}

// add64 sets r to r + a.
// The addend goes into the low word first; the following add-with-carry of
// zero propagates any carry out of the low word into the high word.
func add64(r uint128, a Op) {
ADDQ(a, r.lo)
ADCQ(Imm(0), r.hi)
// NOTE(review): the Generate() call below looks like a line from the
// post-change main() interleaved by the diff rendering — confirm against the
// repository before treating it as part of add64.
Generate()
}

// mul64 sets r to a * b.
// a is first moved into RAX because MULQ takes that operand implicitly; the
// product is then copied out of RDX:RAX into r.hi and r.lo.
func mul64(r uint128, a, b Op) {
MOVQ(a, RAX)
MULQ(b) // RDX, RAX = RAX * b
MOVQ(RAX, r.lo)
MOVQ(RDX, r.hi)
}
// addMulVVW emits a fully unrolled routine named "addMulVVW<bits>" that
// processes bits/64 words of z and x with the multiplier y and returns the
// final carry. Two code paths are generated: a MULQ-based one, and an ADX one
// (MULXQ/ADCXQ/ADOXQ) that is jumped to when the ·supportADX byte equals 1.
// It panics if bits is not a multiple of 64.
func addMulVVW(bits int) {
if bits%64 != 0 {
panic("bit size unsupported")
}

Implement("addMulVVW" + strconv.Itoa(bits))

// Pick the code path before any argument is loaded.
CMPB(Mem{Symbol: Symbol{Name: "·supportADX"}, Base: StaticBase}, Imm(1))
JEQ(LabelRef("adx"))

z := Mem{Base: Load(Param("z"), GP64())}
x := Mem{Base: Load(Param("x"), GP64())}
y := Load(Param("y"), GP64())

carry := GP64()
XORQ(carry, carry) // zero out carry

// Per word: hi:lo = x[i] * y, then add z[i] and the previous carry into lo
// (each carry-out is folded into hi), keep hi as the next carry, and store
// lo back to z[i].
for i := 0; i < bits/64; i++ {
Comment("Iteration " + strconv.Itoa(i))
hi, lo := RDX, RAX // implicit MULQ inputs and outputs
MOVQ(x.Offset(i*8), lo)
MULQ(y)
ADDQ(z.Offset(i*8), lo)
ADCQ(Imm(0), hi)
ADDQ(carry, lo)
ADCQ(Imm(0), hi)
MOVQ(hi, carry)
MOVQ(lo, z.Offset(i*8))
}

Store(carry, ReturnIndex(0))
RET()

// NOTE(review): the addMul64 comment, signature and four instructions that
// follow look like removed-side lines of the diff interleaved by the page
// rendering, not part of addMulVVW — confirm against the repository.
// addMul64 sets r to r + a * b.
func addMul64(r uint128, a, b Op) {
MOVQ(a, RAX)
MULQ(b) // RDX, RAX = RAX * b
ADDQ(RAX, r.lo)
ADCQ(RDX, r.hi)
Label("adx")

// The ADX strategy implements the following function, where c1 and c2 are
// the overflow and the carry flag respectively.
//
// func addMulVVW(z, x []uint, y uint) (carry uint) {
// var c1, c2 uint
// for i := range z {
// hi, lo := bits.Mul(x[i], y)
// lo, c1 = bits.Add(lo, z[i], c1)
// z[i], c2 = bits.Add(lo, carry, c2)
// carry = hi
// }
// return carry + c1 + c2
// }
//
// The loop is fully unrolled and the hi / carry registers are alternated
// instead of introducing a MOV.

// The arguments are loaded again here because the jump to "adx" is emitted
// before the loads of the other path.
z = Mem{Base: Load(Param("z"), GP64())}
x = Mem{Base: Load(Param("x"), GP64())}
Load(Param("y"), RDX) // implicit source of MULXQ

carry = GP64()
XORQ(carry, carry) // zero out carry
z0 := GP64()
XORQ(z0, z0) // unset flags and zero out z0

// NOTE(review): each pass of this loop emits two words (see the i++ in the
// middle), so it assumes bits/64 is even. The bits%64 guard at the top does
// not enforce that; the sizes generated in this file (1024, 1536, 2048) all
// satisfy it.
for i := 0; i < bits/64; i++ {
hi, lo := GP64(), GP64()

// Even word: the high half of the product lands in hi, the incoming carry
// is consumed from carry.
Comment("Iteration " + strconv.Itoa(i))
MULXQ(x.Offset(i*8), lo, hi)
ADCXQ(carry, lo)
ADOXQ(z.Offset(i*8), lo)
MOVQ(lo, z.Offset(i*8))

i++

// Odd word: roles swap, so the high half lands in carry and hi is consumed.
Comment("Iteration " + strconv.Itoa(i))
MULXQ(x.Offset(i*8), lo, carry)
ADCXQ(hi, lo)
ADOXQ(z.Offset(i*8), lo)
MOVQ(lo, z.Offset(i*8))
}

// z0 is zero, so these two adds only fold the pending carry and overflow
// flags into carry.
Comment("Add back carry flags and return")
ADCXQ(z0, carry)
ADOXQ(z0, carry)

Store(carry, ReturnIndex(0))
RET()
}
Loading

1 comment on commit 7d96475

@emmansun
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about the performance change for ecdsa.signNISTEC()? It has two Mul operations.

Please sign in to comment.