Merge pull request #56 from AWSjswinney/arm64-port-pr

bug fix to encode_arm64.s: some registers overwritten in memmove call

The Go runtime's ARM64 memmove clobbers R16 and R17 as of
https://go-review.googlesource.com/c/go/+/243357
Nigel Tao 2020-11-04 09:46:00 +11:00 committed by GitHub
Parents 196ae77b8a f81760ec4c
Commit 674baa8c7f
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 71 deletions
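
Background: in encodeBlock, R17 holds &table and R16 holds the multiplier of
Snappy's hash function, and both were silently clobbered when the inlined
emitLiteral called runtime·memmove; the inlineEmitLiteralMemmove hunk below
re-materializes them after the call. For reference, a minimal Go rendering of
that hash, as in the package's Go encoder (the constant matches the
MOVW/MOVKW pair added by this patch; shift is the table-size-dependent
shift):

    // hash mixes the four bytes loaded from src into a table index.
    // The assembly keeps the 0x1e35a7bd multiplier in R16.
    func hash(u, shift uint32) uint32 {
        return (u * 0x1e35a7bd) >> shift
    }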

decode_arm64.s

@@ -70,7 +70,7 @@ loop:
 // x := uint32(src[s] >> 2)
 // switch
 MOVW $60, R1
-ADD R4>>2, ZR, R4
+LSRW $2, R4, R4
 CMPW R4, R1
 BLS tagLit60Plus
@@ -111,12 +111,11 @@ doLit:
 // is contiguous in memory and so it needs to leave enough source bytes to
 // read the next tag without refilling buffers, but Go's Decode assumes
 // contiguousness (the src argument is a []byte).
-MOVD $16, R1
-CMP R1, R4
+CMP $16, R4
 BGT callMemmove
-CMP R1, R2
+CMP $16, R2
 BLT callMemmove
-CMP R1, R3
+CMP $16, R3
 BLT callMemmove
 // !!! Implement the copy from src to dst as a 16-byte load and store.
@@ -130,9 +129,8 @@ doLit:
 // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 // 16-byte loads and stores. This technique probably wouldn't be as
 // effective on architectures that are fussier about alignment.
-VLD1 0(R6), [V0.B16]
-VST1 [V0.B16], 0(R7)
+LDP 0(R6), (R14, R15)
+STP (R14, R15), 0(R7)
 // d += length
 // s += length
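
The three compares above gate this fast path: assuming, per the surrounding
code, that R4 is the literal length and R2/R3 the remaining room in dst and
src, a single unaligned 16-byte copy (now LDP/STP rather than VLD1/VST1)
cannot go out of bounds even when it writes past d+length. A sketch of the
same guard in Go, with the overcopy stood in by a plain 16-byte copy and an
illustrative helper name:

    // copyLiteralFast mirrors the guarded fast path: bail out unless
    // one 16-byte store is provably in bounds for both slices.
    func copyLiteralFast(dst, src []byte, d, s, length int) (int, int, bool) {
        if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
            return d, s, false // caller falls through to callMemmove
        }
        copy(dst[d:d+16], src[s:s+16]) // the LDP/STP pair in the assembly
        return d + length, s + length, true
    }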
@@ -210,8 +208,7 @@ tagLit61:
 B doLit
 tagLit62Plus:
-MOVW $62, R1
-CMPW R1, R4
+CMPW $62, R4
 BHI tagLit63
 // case x == 62:
@@ -273,8 +270,7 @@ tagCopy:
 // We have a copy tag. We assume that:
 // - R3 == src[s] & 0x03
 // - R4 == src[s]
-MOVD $2, R1
-CMP R1, R3
+CMP $2, R3
 BEQ tagCopy2
 BGT tagCopy4
@@ -346,13 +342,11 @@ doCopy:
 // }
 // copy 16 bytes
 // d += length
-MOVD $16, R1
-MOVD $8, R0
-CMP R1, R4
+CMP $16, R4
 BGT slowForwardCopy
-CMP R0, R5
+CMP $8, R5
 BLT slowForwardCopy
-CMP R1, R14
+CMP $16, R14
 BLT slowForwardCopy
 MOVD 0(R15), R2
 MOVD R2, 0(R7)
@@ -426,8 +420,7 @@ makeOffsetAtLeast8:
 // // The two previous lines together means that d-offset, and therefore
 // // R15, is unchanged.
 // }
-MOVD $8, R1
-CMP R1, R5
+CMP $8, R5
 BGE fixUpSlowForwardCopy
 MOVD (R15), R3
 MOVD R3, (R7)
@@ -477,9 +470,7 @@ verySlowForwardCopy:
 ADD $1, R15, R15
 ADD $1, R7, R7
 SUB $1, R4, R4
-MOVD $0, R1
-CMP R1, R4
-BNE verySlowForwardCopy
+CBNZ R4, verySlowForwardCopy
 B loop
 // The code above handles copy tags.
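
The CBNZ folds the old load-compare-branch sequence into one instruction; the
loop it closes is the byte-at-a-time overlap copy, essentially this, as in
the package's generic Go decoder:

    // verySlowForwardCopy advances one byte at a time so that an offset
    // smaller than the remaining length re-reads bytes written earlier
    // by this same copy (offset 1 yields a run of one repeated byte).
    func verySlowForwardCopy(dst []byte, d, offset, length int) int {
        for end := d + length; d != end; d++ {
            dst[d] = dst[d-offset]
        }
        return d
    }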

encode_arm64.s

@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
 MOVW R3, R4
 SUBW $1, R4, R4
-MOVW $60, R2
-CMPW R2, R4
+CMPW $60, R4
 BLT oneByte
-MOVW $256, R2
-CMPW R2, R4
+CMPW $256, R4
 BLT twoBytes
 threeBytes:
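
The two rewritten compares pick among the literal-tag encodings on
n = length-1; paraphrased from the package's generic Go emitLiteral (the
helper name here is illustrative):

    const tagLiteral = 0x00 // Snappy tag byte for a literal run

    // emitLiteralTag sketches the oneByte/twoBytes/threeBytes split:
    // n < 60 fits in the tag byte itself, n < 256 takes one extra
    // byte, and anything larger takes two extra bytes.
    func emitLiteralTag(dst []byte, n int) int {
        switch {
        case n < 60:
            dst[0] = uint8(n)<<2 | tagLiteral
            return 1
        case n < 1<<8:
            dst[0] = 60<<2 | tagLiteral
            dst[1] = uint8(n)
            return 2
        default:
            dst[0] = 61<<2 | tagLiteral
            dst[1] = uint8(n)
            dst[2] = uint8(n >> 8)
            return 3
        }
    }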
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
 loop0:
 // for length >= 68 { etc }
-MOVW $68, R2
-CMPW R2, R3
+CMPW $68, R3
 BLT step1
 // Emit a length 64 copy, encoded as 3 bytes.
@@ -112,8 +109,7 @@ loop0:
 step1:
 // if length > 64 { etc }
-MOVD $64, R2
-CMP R2, R3
+CMP $64, R3
 BLE step2
 // Emit a length 60 copy, encoded as 3 bytes.
@@ -125,11 +121,9 @@ step1:
 step2:
 // if length >= 12 || offset >= 2048 { goto step3 }
-MOVD $12, R2
-CMP R2, R3
+CMP $12, R3
 BGE step3
-MOVW $2048, R2
-CMPW R2, R11
+CMPW $2048, R11
 BGE step3
 // Emit the remaining copy, encoded as 2 bytes.
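
loop0, step1 and step2 together form the copy emitter; in Go terms,
paraphrasing the package's generic emitCopy with the standard Snappy tag
constants:

    const (
        tagCopy1 = 0x01 // 2-byte copy tag: 11-bit offset, length 4..11
        tagCopy2 = 0x02 // 3-byte copy tag: 16-bit offset, length 1..64
    )

    // emitCopy peels off length-64 chunks while length >= 68, then a
    // length-60 chunk if length > 64, then emits one final tag: 3 bytes
    // in general, or 2 bytes when length < 12 and offset < 2048.
    func emitCopy(dst []byte, offset, length int) int {
        i := 0
        for length >= 68 {
            dst[i+0] = 63<<2 | tagCopy2 // a length-64 copy
            dst[i+1] = uint8(offset)
            dst[i+2] = uint8(offset >> 8)
            i += 3
            length -= 64
        }
        if length > 64 {
            dst[i+0] = 59<<2 | tagCopy2 // a length-60 copy
            dst[i+1] = uint8(offset)
            dst[i+2] = uint8(offset >> 8)
            i += 3
            length -= 60
        }
        if length >= 12 || offset >= 2048 {
            dst[i+0] = uint8(length-1)<<2 | tagCopy2
            dst[i+1] = uint8(offset)
            dst[i+2] = uint8(offset >> 8)
            return i + 3
        }
        dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
        dst[i+1] = uint8(offset)
        return i + 2
    }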
@@ -295,25 +289,22 @@ varTable:
 // var table [maxTableSize]uint16
 //
 // In the asm code, unlike the Go code, we can zero-initialize only the
-// first tableSize elements. Each uint16 element is 2 bytes and each VST1
-// writes 64 bytes, so we can do only tableSize/32 writes instead of the
-// 2048 writes that would zero-initialize all of table's 32768 bytes.
-// This clear could overrun the first tableSize elements, but it won't
-// overrun the allocated stack size.
+// first tableSize elements. Each uint16 element is 2 bytes and each
+// iteration writes 64 bytes, so we can do only tableSize/32 writes
+// instead of the 2048 writes that would zero-initialize all of table's
+// 32768 bytes. This clear could overrun the first tableSize elements, but
+// it won't overrun the allocated stack size.
 ADD $128, RSP, R17
 MOVD R17, R4
 // !!! R6 = &src[tableSize]
 ADD R6<<1, R17, R6
-// zero the SIMD registers
-VEOR V0.B16, V0.B16, V0.B16
-VEOR V1.B16, V1.B16, V1.B16
-VEOR V2.B16, V2.B16, V2.B16
-VEOR V3.B16, V3.B16, V3.B16
 memclr:
-VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
+STP.P (ZR, ZR), 64(R4)
+STP (ZR, ZR), -48(R4)
+STP (ZR, ZR), -32(R4)
+STP (ZR, ZR), -16(R4)
 CMP R4, R6
 BHI memclr
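
Each pass of the rewritten memclr stores 64 bytes of zeros (four STP pairs),
i.e. 32 uint16 elements, so clearing table[:tableSize] takes tableSize/32
iterations rather than the 2048 needed for all 32768 bytes. The same partial
clear in Go (illustrative helper; maxTableSize is 1<<14 as in the Go encoder,
and tableSize is a power of two of at least 1<<8, so the loop never runs past
the array):

    // clearTable zeroes at least table[:tableSize], 32 elements per
    // step, matching the four STP stores per asm iteration.
    func clearTable(table *[1 << 14]uint16, tableSize int) {
        for i := 0; i < tableSize; i += 32 {
            for j := i; j < i+32; j++ {
                table[j] = 0
            }
        }
    }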
@@ -404,8 +395,7 @@ fourByteMatch:
 // on inputMargin in encode.go.
 MOVD R7, R3
 SUB R10, R3, R3
-MOVD $16, R2
-CMP R2, R3
+CMP $16, R3
 BLE emitLiteralFastPath
 // ----------------------------------------
@@ -465,6 +455,9 @@ inlineEmitLiteralMemmove:
 MOVD 88(RSP), R8
 MOVD 96(RSP), R9
 MOVD 120(RSP), R15
+ADD $128, RSP, R17
+MOVW $0xa7bd, R16
+MOVKW $(0x1e35<<16), R16
 B inner1
 inlineEmitLiteralEnd:
@@ -489,8 +482,8 @@ emitLiteralFastPath:
 // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 // 16-byte loads and stores. This technique probably wouldn't be as
 // effective on architectures that are fussier about alignment.
-VLD1 0(R10), [V0.B16]
-VST1 [V0.B16], 0(R8)
+LDP 0(R10), (R0, R1)
+STP (R0, R1), 0(R8)
 ADD R3, R8, R8
 inner1: