Merge pull request #56 from AWSjswinney/arm64-port-pr

bug fix to encode_arm64.s: some registers overwritten in memmove call

ARM64 memmove clobbers R16 and R17 as of
https://go-review.googlesource.com/c/go/+/243357
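In encodeBlock, R17 caches the address of the on-stack hash table (RSP+128) and R16 caches the 0x1e35a7bd hash-multiplier constant; after that Go change, runtime·memmove on arm64 may clobber both, so the fix rebuilds them once the call returns instead of assuming they survive it. A minimal sketch of the pattern, with the register roles inferred from the values the new code restores:

	CALL  runtime·memmove(SB)   // may clobber R16 and R17 on arm64
	// Rebuild the two cached values rather than assuming they survived:
	ADD   $128, RSP, R17        // R17 = &table (the table lives at RSP+128)
	MOVW  $0xa7bd, R16          // R16 = 0x1e35a7bd, the hash multiplier,
	MOVKW $(0x1e35<<16), R16    // built in two 16-bit halves

Spilling R16/R17 to the frame before the CALL and reloading them afterwards would also work; rebuilding them takes three instructions and leaves the stack frame untouched.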
This commit is contained in:
Nigel Tao 2020-11-04 09:46:00 +11:00, committed by GitHub
Parents 196ae77b8a f81760ec4c
Commit 674baa8c7f
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files: 55 additions and 71 deletions

decode_arm64.s

@@ -70,7 +70,7 @@ loop:
// x := uint32(src[s] >> 2)
// switch
MOVW $60, R1
- ADD R4>>2, ZR, R4
+ LSRW $2, R4, R4
CMPW R4, R1
BLS tagLit60Plus
@@ -111,13 +111,12 @@ doLit:
// is contiguous in memory and so it needs to leave enough source bytes to
// read the next tag without refilling buffers, but Go's Decode assumes
// contiguousness (the src argument is a []byte).
- MOVD $16, R1
- CMP R1, R4
- BGT callMemmove
- CMP R1, R2
- BLT callMemmove
- CMP R1, R3
- BLT callMemmove
+ CMP $16, R4
+ BGT callMemmove
+ CMP $16, R2
+ BLT callMemmove
+ CMP $16, R3
+ BLT callMemmove
// !!! Implement the copy from src to dst as a 16-byte load and store.
// (Decode's documentation says that dst and src must not overlap.)
@@ -130,9 +129,8 @@ doLit:
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
- VLD1 0(R6), [V0.B16]
- VST1 [V0.B16], 0(R7)
+ LDP 0(R6), (R14, R15)
+ STP (R14, R15), 0(R7)
// d += length
// s += length
@@ -210,8 +208,7 @@ tagLit61:
B doLit
tagLit62Plus:
- MOVW $62, R1
- CMPW R1, R4
+ CMPW $62, R4
BHI tagLit63
// case x == 62:
@@ -273,10 +270,9 @@ tagCopy:
// We have a copy tag. We assume that:
// - R3 == src[s] & 0x03
// - R4 == src[s]
- MOVD $2, R1
- CMP R1, R3
- BEQ tagCopy2
- BGT tagCopy4
+ CMP $2, R3
+ BEQ tagCopy2
+ BGT tagCopy4
// case tagCopy1:
// s += 2
@@ -346,13 +342,11 @@ doCopy:
// }
// copy 16 bytes
// d += length
- MOVD $16, R1
- MOVD $8, R0
- CMP R1, R4
+ CMP $16, R4
BGT slowForwardCopy
- CMP R0, R5
+ CMP $8, R5
BLT slowForwardCopy
- CMP R1, R14
+ CMP $16, R14
BLT slowForwardCopy
MOVD 0(R15), R2
MOVD R2, 0(R7)
@@ -426,8 +420,7 @@ makeOffsetAtLeast8:
// // The two previous lines together means that d-offset, and therefore
// // R15, is unchanged.
// }
- MOVD $8, R1
- CMP R1, R5
+ CMP $8, R5
BGE fixUpSlowForwardCopy
MOVD (R15), R3
MOVD R3, (R7)
@@ -477,9 +470,7 @@ verySlowForwardCopy:
ADD $1, R15, R15
ADD $1, R7, R7
SUB $1, R4, R4
- MOVD $0, R1
- CMP R1, R4
- BNE verySlowForwardCopy
+ CBNZ R4, verySlowForwardCopy
B loop
// The code above handles copy tags.

encode_arm64.s

@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
MOVW R3, R4
SUBW $1, R4, R4
- MOVW $60, R2
- CMPW R2, R4
+ CMPW $60, R4
BLT oneByte
- MOVW $256, R2
- CMPW R2, R4
+ CMPW $256, R4
BLT twoBytes
threeBytes:
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
loop0:
// for length >= 68 { etc }
- MOVW $68, R2
- CMPW R2, R3
+ CMPW $68, R3
BLT step1
// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@ loop0:
step1:
// if length > 64 { etc }
- MOVD $64, R2
- CMP R2, R3
- BLE step2
+ CMP $64, R3
+ BLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVD $0xee, R2
@@ -125,11 +121,9 @@ step1:
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
- MOVD $12, R2
- CMP R2, R3
+ CMP $12, R3
BGE step3
- MOVW $2048, R2
- CMPW R2, R11
+ CMPW $2048, R11
BGE step3
// Emit the remaining copy, encoded as 2 bytes.
@@ -295,27 +289,24 @@ varTable:
// var table [maxTableSize]uint16
//
// In the asm code, unlike the Go code, we can zero-initialize only the
- // first tableSize elements. Each uint16 element is 2 bytes and each VST1
- // writes 64 bytes, so we can do only tableSize/32 writes instead of the
- // 2048 writes that would zero-initialize all of table's 32768 bytes.
- // This clear could overrun the first tableSize elements, but it won't
- // overrun the allocated stack size.
+ // first tableSize elements. Each uint16 element is 2 bytes and each
+ // iteration writes 64 bytes, so we can do only tableSize/32 writes
+ // instead of the 2048 writes that would zero-initialize all of table's
+ // 32768 bytes. This clear could overrun the first tableSize elements, but
+ // it won't overrun the allocated stack size.
ADD $128, RSP, R17
MOVD R17, R4
// !!! R6 = &src[tableSize]
ADD R6<<1, R17, R6
- // zero the SIMD registers
- VEOR V0.B16, V0.B16, V0.B16
- VEOR V1.B16, V1.B16, V1.B16
- VEOR V2.B16, V2.B16, V2.B16
- VEOR V3.B16, V3.B16, V3.B16
memclr:
- VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
- CMP R4, R6
- BHI memclr
+ STP.P (ZR, ZR), 64(R4)
+ STP (ZR, ZR), -48(R4)
+ STP (ZR, ZR), -32(R4)
+ STP (ZR, ZR), -16(R4)
+ CMP R4, R6
+ BHI memclr
// !!! R6 = &src[0]
MOVD R7, R6
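To put the memclr comment above in concrete terms (assuming maxTableSize is 16384 uint16 entries, i.e. 32768 bytes, as in encode.go): each iteration stores four pairs of zero registers, 16 bytes per STP, so 64 bytes or 32 table entries per pass. For tableSize = 4096 the loop runs 4096/32 = 128 times and clears 8192 bytes, instead of the 2048 sixteen-byte stores a full clear of the table would need.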
@@ -404,8 +395,7 @@ fourByteMatch:
// on inputMargin in encode.go.
MOVD R7, R3
SUB R10, R3, R3
- MOVD $16, R2
- CMP R2, R3
+ CMP $16, R3
BLE emitLiteralFastPath
// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
MOVD R3, 24(RSP)
// Finish the "d +=" part of "d += emitLiteral(etc)".
- ADD R3, R8, R8
- MOVD R7, 80(RSP)
- MOVD R8, 88(RSP)
- MOVD R15, 120(RSP)
- CALL runtime·memmove(SB)
- MOVD 64(RSP), R5
- MOVD 72(RSP), R6
- MOVD 80(RSP), R7
- MOVD 88(RSP), R8
- MOVD 96(RSP), R9
- MOVD 120(RSP), R15
- B inner1
+ ADD R3, R8, R8
+ MOVD R7, 80(RSP)
+ MOVD R8, 88(RSP)
+ MOVD R15, 120(RSP)
+ CALL runtime·memmove(SB)
+ MOVD 64(RSP), R5
+ MOVD 72(RSP), R6
+ MOVD 80(RSP), R7
+ MOVD 88(RSP), R8
+ MOVD 96(RSP), R9
+ MOVD 120(RSP), R15
+ ADD $128, RSP, R17
+ MOVW $0xa7bd, R16
+ MOVKW $(0x1e35<<16), R16
+ B inner1
inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
- VLD1 0(R10), [V0.B16]
- VST1 [V0.B16], 0(R8)
- ADD R3, R8, R8
+ LDP 0(R10), (R0, R1)
+ STP (R0, R1), 0(R8)
+ ADD R3, R8, R8
inner1:
// for { etc }