Commit 674baa8

Merge pull request #56 from AWSjswinney/arm64-port-pr

bug fix to encode_arm64.s: some registers overwritten in memmove call

The ARM64 memmove clobbers R16 and R17 as of https://go-review.googlesource.com/c/go/+/243357, so their values must be restored after the call.

2 parents: 196ae77 + f81760e
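
For context on the fix: R16 and R17 are ARM64's intra-procedure-call scratch registers (IP0/IP1), and since the Go change linked above the runtime's memmove uses them internally, so any value kept there dies across the call. A minimal sketch of the hazard, not repo code; the register roles (R16 = snappy's hash multiplier 0x1e35a7bd, R17 = hash-table base) are taken from the diffs below:

	MOVW $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16  // R16 = 0x1e35a7bd, assumed to stay live
	CALL runtime·memmove(SB)  // clobbers R16 and R17 on arm64
	// BUG: R16 no longer holds the multiplier here; this commit re-materializes
	// R16 and R17 right after the CALL (see the inlineEmitLiteralMemmove hunk).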

2 files changed: +55 −71 lines

decode_arm64.s (18 additions, 27 deletions)
@@ -70,7 +70,7 @@ loop:
 	// x := uint32(src[s] >> 2)
 	// switch
 	MOVW $60, R1
-	ADD R4>>2, ZR, R4
+	LSRW $2, R4, R4
 	CMPW R4, R1
 	BLS tagLit60Plus

@@ -111,13 +111,12 @@ doLit:
 	// is contiguous in memory and so it needs to leave enough source bytes to
 	// read the next tag without refilling buffers, but Go's Decode assumes
 	// contiguousness (the src argument is a []byte).
-	MOVD $16, R1
-	CMP R1, R4
-	BGT callMemmove
-	CMP R1, R2
-	BLT callMemmove
-	CMP R1, R3
-	BLT callMemmove
+	CMP $16, R4
+	BGT callMemmove
+	CMP $16, R2
+	BLT callMemmove
+	CMP $16, R3
+	BLT callMemmove

 	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)

@@ -130,9 +129,8 @@ doLit:
 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-
-	VLD1 0(R6), [V0.B16]
-	VST1 [V0.B16], 0(R7)
+	LDP 0(R6), (R14, R15)
+	STP (R14, R15), 0(R7)

 	// d += length
 	// s += length
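
The VLD1/VST1 pair moved 16 bytes through a vector register; the LDP/STP replacement moves the same 16 bytes through two integer registers instead, avoiding the vector unit entirely. A condensed sketch of the pattern, with register names from the hunk above:

	LDP 0(R6), (R14, R15)  // load two 8-byte words from src
	STP (R14, R15), 0(R7)  // store the pair to dst: a 16-byte copy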
@@ -210,8 +208,7 @@ tagLit61:
 	B doLit

 tagLit62Plus:
-	MOVW $62, R1
-	CMPW R1, R4
+	CMPW $62, R4
 	BHI tagLit63

 	// case x == 62:

@@ -273,10 +270,9 @@ tagCopy:
 	// We have a copy tag. We assume that:
 	// - R3 == src[s] & 0x03
 	// - R4 == src[s]
-	MOVD $2, R1
-	CMP R1, R3
-	BEQ tagCopy2
-	BGT tagCopy4
+	CMP $2, R3
+	BEQ tagCopy2
+	BGT tagCopy4

 	// case tagCopy1:
 	// s += 2

@@ -346,13 +342,11 @@ doCopy:
 	// }
 	// copy 16 bytes
 	// d += length
-	MOVD $16, R1
-	MOVD $8, R0
-	CMP R1, R4
+	CMP $16, R4
 	BGT slowForwardCopy
-	CMP R0, R5
+	CMP $8, R5
 	BLT slowForwardCopy
-	CMP R1, R14
+	CMP $16, R14
 	BLT slowForwardCopy
 	MOVD 0(R15), R2
 	MOVD R2, 0(R7)

@@ -426,8 +420,7 @@ makeOffsetAtLeast8:
 	// // The two previous lines together means that d-offset, and therefore
 	// // R15, is unchanged.
 	// }
-	MOVD $8, R1
-	CMP R1, R5
+	CMP $8, R5
 	BGE fixUpSlowForwardCopy
 	MOVD (R15), R3
 	MOVD R3, (R7)

@@ -477,9 +470,7 @@ verySlowForwardCopy:
 	ADD $1, R15, R15
 	ADD $1, R7, R7
 	SUB $1, R4, R4
-	MOVD $0, R1
-	CMP R1, R4
-	BNE verySlowForwardCopy
+	CBNZ R4, verySlowForwardCopy
 	B loop

 	// The code above handles copy tags.
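
The remaining decode changes are of two mechanical kinds: small constants are folded directly into CMP/CMPW (ARM64 compares take a 12-bit immediate, so the scratch register R1 and its MOVD/MOVW setup are unnecessary), and the explicit zero test is collapsed into a single compare-and-branch instruction. A before/after sketch of the latter, taken from the verySlowForwardCopy hunk above:

	// before: materialize zero, compare, branch
	MOVD $0, R1
	CMP  R1, R4
	BNE  verySlowForwardCopy
	// after: one instruction, no scratch register
	CBNZ R4, verySlowForwardCopy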

encode_arm64.s (37 additions, 44 deletions)
@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
 	MOVW R3, R4
 	SUBW $1, R4, R4

-	MOVW $60, R2
-	CMPW R2, R4
+	CMPW $60, R4
 	BLT oneByte
-	MOVW $256, R2
-	CMPW R2, R4
+	CMPW $256, R4
 	BLT twoBytes

 threeBytes:

@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48

 loop0:
 	// for length >= 68 { etc }
-	MOVW $68, R2
-	CMPW R2, R3
+	CMPW $68, R3
 	BLT step1

 	// Emit a length 64 copy, encoded as 3 bytes.

@@ -112,9 +109,8 @@ loop0:

 step1:
 	// if length > 64 { etc }
-	MOVD $64, R2
-	CMP R2, R3
-	BLE step2
+	CMP $64, R3
+	BLE step2

 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVD $0xee, R2

@@ -125,11 +121,9 @@ step1:

 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	MOVD $12, R2
-	CMP R2, R3
+	CMP $12, R3
 	BGE step3
-	MOVW $2048, R2
-	CMPW R2, R11
+	CMPW $2048, R11
 	BGE step3

 	// Emit the remaining copy, encoded as 2 bytes.

@@ -295,27 +289,24 @@ varTable:
 	// var table [maxTableSize]uint16
 	//
 	// In the asm code, unlike the Go code, we can zero-initialize only the
-	// first tableSize elements. Each uint16 element is 2 bytes and each VST1
-	// writes 64 bytes, so we can do only tableSize/32 writes instead of the
-	// 2048 writes that would zero-initialize all of table's 32768 bytes.
-	// This clear could overrun the first tableSize elements, but it won't
-	// overrun the allocated stack size.
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iteration writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
 	ADD $128, RSP, R17
 	MOVD R17, R4

 	// !!! R6 = &src[tableSize]
 	ADD R6<<1, R17, R6

-	// zero the SIMD registers
-	VEOR V0.B16, V0.B16, V0.B16
-	VEOR V1.B16, V1.B16, V1.B16
-	VEOR V2.B16, V2.B16, V2.B16
-	VEOR V3.B16, V3.B16, V3.B16
-
 memclr:
-	VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
-	CMP R4, R6
-	BHI memclr
+	STP.P (ZR, ZR), 64(R4)
+	STP (ZR, ZR), -48(R4)
+	STP (ZR, ZR), -32(R4)
+	STP (ZR, ZR), -16(R4)
+	CMP R4, R6
+	BHI memclr

 	// !!! R6 = &src[0]
 	MOVD R7, R6
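
As a reading aid, the same clear loop with each store annotated (a sketch, not repo text; per the hunk above, R4 is the write cursor and R6 points one past the region to clear). The post-indexed STP.P zeroes the first 16 bytes of a 64-byte stride and advances R4, and the three negative-offset stores backfill the rest:

	memclr:
		STP.P (ZR, ZR), 64(R4) // zero bytes 0-15 of the stride; R4 += 64
		STP (ZR, ZR), -48(R4)  // zero bytes 16-31 (offsets from the new R4)
		STP (ZR, ZR), -32(R4)  // zero bytes 32-47
		STP (ZR, ZR), -16(R4)  // zero bytes 48-63
		CMP R4, R6
		BHI memclr             // loop while the end pointer is above the cursor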
@@ -404,8 +395,7 @@ fourByteMatch:
 	// on inputMargin in encode.go.
 	MOVD R7, R3
 	SUB R10, R3, R3
-	MOVD $16, R2
-	CMP R2, R3
+	CMP $16, R3
 	BLE emitLiteralFastPath

 	// ----------------------------------------
// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
 	MOVD R3, 24(RSP)

 	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADD R3, R8, R8
-	MOVD R7, 80(RSP)
-	MOVD R8, 88(RSP)
-	MOVD R15, 120(RSP)
-	CALL runtime·memmove(SB)
-	MOVD 64(RSP), R5
-	MOVD 72(RSP), R6
-	MOVD 80(RSP), R7
-	MOVD 88(RSP), R8
-	MOVD 96(RSP), R9
-	MOVD 120(RSP), R15
-	B inner1
+	ADD R3, R8, R8
+	MOVD R7, 80(RSP)
+	MOVD R8, 88(RSP)
+	MOVD R15, 120(RSP)
+	CALL runtime·memmove(SB)
+	MOVD 64(RSP), R5
+	MOVD 72(RSP), R6
+	MOVD 80(RSP), R7
+	MOVD 88(RSP), R8
+	MOVD 96(RSP), R9
+	MOVD 120(RSP), R15
+	ADD $128, RSP, R17
+	MOVW $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+	B inner1

 inlineEmitLiteralEnd:
 	// End inline of the emitLiteral call.
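
This hunk is the actual bug fix. Unlike R5-R9 and R15, which are spilled to the stack around the CALL, R17 and R16 are simply recomputed afterwards, since rebuilding them takes three instructions and no stack traffic. Annotated, assuming (per the varTable block above) that R17 holds the table base and R16 snappy's hash multiplier:

	ADD $128, RSP, R17        // R17 = &table[0] (table lives at RSP+128)
	MOVW $0xa7bd, R16         // low 16 bits of the hash multiplier
	MOVKW $(0x1e35<<16), R16  // R16 = 0x1e35a7bd again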
@@ -489,9 +482,9 @@ emitLiteralFastPath:
 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-	VLD1 0(R10), [V0.B16]
-	VST1 [V0.B16], 0(R8)
-	ADD R3, R8, R8
+	LDP 0(R10), (R0, R1)
+	STP (R0, R1), 0(R8)
+	ADD R3, R8, R8

 inner1:
 	// for { etc }
