Open
Description
What version of Go are you using (go version
)?
$ go version go version devel +3b2a578166 Sat Dec 5 16:20:01 2020 +0000 darwin/arm64
What did you do?
Compiled this function.
type fieldElement struct {
l0, l1, l2, l3, l4 uint64
}
func (v *fieldElement) carryPropagateGeneric() *fieldElement {
c0 := v.l0 >> 51
c1 := v.l1 >> 51
c2 := v.l2 >> 51
c3 := v.l3 >> 51
c4 := v.l4 >> 51
v.l0 = v.l0&maskLow51Bits + c4*19
v.l1 = v.l1&maskLow51Bits + c0
v.l2 = v.l2&maskLow51Bits + c1
v.l3 = v.l3&maskLow51Bits + c2
v.l4 = v.l4&maskLow51Bits + c3
return v
}
Full codebase at FiloSottile/edwards25519#8
What did you expect to see?
// func carryPropagate(v *fieldElement)
TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8
MOVD v+0(FP), R20
LDP 0(R20), (R0, R1)
LDP 16(R20), (R2, R3)
MOVD 32(R20), R4
AND $0x7ffffffffffff, R0, R10
AND $0x7ffffffffffff, R1, R11
AND $0x7ffffffffffff, R2, R12
AND $0x7ffffffffffff, R3, R13
AND $0x7ffffffffffff, R4, R14
ADD R0>>51, R11, R11
ADD R1>>51, R12, R12
ADD R2>>51, R13, R13
ADD R3>>51, R14, R14
// R4>>51 * 19 + R10 -> R10
LSR $51, R4, R21
MOVD $19, R22
MADD R22, R10, R21, R10
STP (R10, R11), 0(R20)
STP (R12, R13), 16(R20)
MOVD R14, 32(R20)
RET
What did you see instead?
"".(*fieldElement).carryPropagateGeneric STEXT size=128 args=0x10 locals=0x0 funcid=0x0 leaf
0x0000 00000 (fe_generic.go:183) TEXT "".(*fieldElement).carryPropagateGeneric(SB), LEAF|NOFRAME|ABIInternal, $0-16
0x0000 00000 (fe_generic.go:183) FUNCDATA ZR, gclocals·524d71b8d4b4126db12e7a6de3370d94(SB)
0x0000 00000 (fe_generic.go:183) FUNCDATA $1, gclocals·69c1753bd5f81501d95132d08af04464(SB)
0x0000 00000 (fe_generic.go:184) MOVD "".v(FP), R0
0x0004 00004 (fe_generic.go:184) MOVD (R0), R1
0x0008 00008 (fe_generic.go:185) MOVD 8(R0), R2
0x000c 00012 (fe_generic.go:186) MOVD 16(R0), R3
0x0010 00016 (fe_generic.go:187) MOVD 24(R0), R4
0x0014 00020 (fe_generic.go:188) MOVD 32(R0), R5
0x0018 00024 (fe_generic.go:188) LSR $51, R5, R5
0x001c 00028 (fe_generic.go:190) AND $2251799813685247, R1, R6
0x0020 00032 (fe_generic.go:190) MOVD $19, R7
0x0024 00036 (fe_generic.go:190) MADD R5, R6, R7, R5
0x0028 00040 (fe_generic.go:190) MOVD R5, (R0)
0x002c 00044 (fe_generic.go:191) MOVD 8(R0), R5
0x0030 00048 (fe_generic.go:191) AND $2251799813685247, R5, R5
0x0034 00052 (fe_generic.go:191) ADD R1>>51, R5, R1
0x0038 00056 (fe_generic.go:191) MOVD R1, 8(R0)
0x003c 00060 (fe_generic.go:192) MOVD 16(R0), R1
0x0040 00064 (fe_generic.go:192) AND $2251799813685247, R1, R1
0x0044 00068 (fe_generic.go:192) ADD R2>>51, R1, R1
0x0048 00072 (fe_generic.go:192) MOVD R1, 16(R0)
0x004c 00076 (fe_generic.go:193) MOVD 24(R0), R1
0x0050 00080 (fe_generic.go:193) AND $2251799813685247, R1, R1
0x0054 00084 (fe_generic.go:193) ADD R3>>51, R1, R1
0x0058 00088 (fe_generic.go:193) MOVD R1, 24(R0)
0x005c 00092 (fe_generic.go:194) MOVD 32(R0), R1
0x0060 00096 (fe_generic.go:194) AND $2251799813685247, R1, R1
0x0064 00100 (fe_generic.go:194) ADD R4>>51, R1, R1
0x0068 00104 (fe_generic.go:194) MOVD R1, 32(R0)
0x006c 00108 (fe_generic.go:196) MOVD R0, "".~r0+8(FP)
0x0070 00112 (fe_generic.go:196) RET (R30)
The compiler figures out the same AND, ADD, and LSR+MADD that my hand-written assembly uses, but note how it loads the inputs twice from memory and looks like it doesn't know about STP and LDP.
Not sure which part makes the most effect, but I got a 10% speedup on some high-level functions (although not on microbenchmarks of thinner functions) between my assembly and the compiler with go:noinline
. (Interestingly, if I let the compiler inline the high level functions get even slower, while the thin ones get faster.)
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
Triage Backlog