зеркало из https://github.com/golang/snappy.git
Merge pull request #56 from AWSjswinney/arm64-port-pr
Bug fix to encode_arm64.s: some registers were overwritten by the memmove call. ARM64 memmove clobbers R16 and R17 as of https://go-review.googlesource.com/c/go/+/243357
This commit is contained in:
Коммит
674baa8c7f
|
@ -70,7 +70,7 @@ loop:
|
|||
// x := uint32(src[s] >> 2)
|
||||
// switch
|
||||
MOVW $60, R1
|
||||
ADD R4>>2, ZR, R4
|
||||
LSRW $2, R4, R4
|
||||
CMPW R4, R1
|
||||
BLS tagLit60Plus
|
||||
|
||||
|
@ -111,12 +111,11 @@ doLit:
|
|||
// is contiguous in memory and so it needs to leave enough source bytes to
|
||||
// read the next tag without refilling buffers, but Go's Decode assumes
|
||||
// contiguousness (the src argument is a []byte).
|
||||
MOVD $16, R1
|
||||
CMP R1, R4
|
||||
CMP $16, R4
|
||||
BGT callMemmove
|
||||
CMP R1, R2
|
||||
CMP $16, R2
|
||||
BLT callMemmove
|
||||
CMP R1, R3
|
||||
CMP $16, R3
|
||||
BLT callMemmove
|
||||
|
||||
// !!! Implement the copy from src to dst as a 16-byte load and store.
|
||||
|
@ -130,9 +129,8 @@ doLit:
|
|||
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
|
||||
// 16-byte loads and stores. This technique probably wouldn't be as
|
||||
// effective on architectures that are fussier about alignment.
|
||||
|
||||
VLD1 0(R6), [V0.B16]
|
||||
VST1 [V0.B16], 0(R7)
|
||||
LDP 0(R6), (R14, R15)
|
||||
STP (R14, R15), 0(R7)
|
||||
|
||||
// d += length
|
||||
// s += length
|
||||
|
@ -210,8 +208,7 @@ tagLit61:
|
|||
B doLit
|
||||
|
||||
tagLit62Plus:
|
||||
MOVW $62, R1
|
||||
CMPW R1, R4
|
||||
CMPW $62, R4
|
||||
BHI tagLit63
|
||||
|
||||
// case x == 62:
|
||||
|
@ -273,8 +270,7 @@ tagCopy:
|
|||
// We have a copy tag. We assume that:
|
||||
// - R3 == src[s] & 0x03
|
||||
// - R4 == src[s]
|
||||
MOVD $2, R1
|
||||
CMP R1, R3
|
||||
CMP $2, R3
|
||||
BEQ tagCopy2
|
||||
BGT tagCopy4
|
||||
|
||||
|
@ -346,13 +342,11 @@ doCopy:
|
|||
// }
|
||||
// copy 16 bytes
|
||||
// d += length
|
||||
MOVD $16, R1
|
||||
MOVD $8, R0
|
||||
CMP R1, R4
|
||||
CMP $16, R4
|
||||
BGT slowForwardCopy
|
||||
CMP R0, R5
|
||||
CMP $8, R5
|
||||
BLT slowForwardCopy
|
||||
CMP R1, R14
|
||||
CMP $16, R14
|
||||
BLT slowForwardCopy
|
||||
MOVD 0(R15), R2
|
||||
MOVD R2, 0(R7)
|
||||
|
@ -426,8 +420,7 @@ makeOffsetAtLeast8:
|
|||
// // The two previous lines together means that d-offset, and therefore
|
||||
// // R15, is unchanged.
|
||||
// }
|
||||
MOVD $8, R1
|
||||
CMP R1, R5
|
||||
CMP $8, R5
|
||||
BGE fixUpSlowForwardCopy
|
||||
MOVD (R15), R3
|
||||
MOVD R3, (R7)
|
||||
|
@ -477,9 +470,7 @@ verySlowForwardCopy:
|
|||
ADD $1, R15, R15
|
||||
ADD $1, R7, R7
|
||||
SUB $1, R4, R4
|
||||
MOVD $0, R1
|
||||
CMP R1, R4
|
||||
BNE verySlowForwardCopy
|
||||
CBNZ R4, verySlowForwardCopy
|
||||
B loop
|
||||
|
||||
// The code above handles copy tags.
|
||||
|
|
|
@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
|
|||
MOVW R3, R4
|
||||
SUBW $1, R4, R4
|
||||
|
||||
MOVW $60, R2
|
||||
CMPW R2, R4
|
||||
CMPW $60, R4
|
||||
BLT oneByte
|
||||
MOVW $256, R2
|
||||
CMPW R2, R4
|
||||
CMPW $256, R4
|
||||
BLT twoBytes
|
||||
|
||||
threeBytes:
|
||||
|
@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
|
|||
|
||||
loop0:
|
||||
// for length >= 68 { etc }
|
||||
MOVW $68, R2
|
||||
CMPW R2, R3
|
||||
CMPW $68, R3
|
||||
BLT step1
|
||||
|
||||
// Emit a length 64 copy, encoded as 3 bytes.
|
||||
|
@ -112,8 +109,7 @@ loop0:
|
|||
|
||||
step1:
|
||||
// if length > 64 { etc }
|
||||
MOVD $64, R2
|
||||
CMP R2, R3
|
||||
CMP $64, R3
|
||||
BLE step2
|
||||
|
||||
// Emit a length 60 copy, encoded as 3 bytes.
|
||||
|
@ -125,11 +121,9 @@ step1:
|
|||
|
||||
step2:
|
||||
// if length >= 12 || offset >= 2048 { goto step3 }
|
||||
MOVD $12, R2
|
||||
CMP R2, R3
|
||||
CMP $12, R3
|
||||
BGE step3
|
||||
MOVW $2048, R2
|
||||
CMPW R2, R11
|
||||
CMPW $2048, R11
|
||||
BGE step3
|
||||
|
||||
// Emit the remaining copy, encoded as 2 bytes.
|
||||
|
@ -295,25 +289,22 @@ varTable:
|
|||
// var table [maxTableSize]uint16
|
||||
//
|
||||
// In the asm code, unlike the Go code, we can zero-initialize only the
|
||||
// first tableSize elements. Each uint16 element is 2 bytes and each VST1
|
||||
// writes 64 bytes, so we can do only tableSize/32 writes instead of the
|
||||
// 2048 writes that would zero-initialize all of table's 32768 bytes.
|
||||
// This clear could overrun the first tableSize elements, but it won't
|
||||
// overrun the allocated stack size.
|
||||
// first tableSize elements. Each uint16 element is 2 bytes and each
|
||||
// iteration writes 64 bytes, so we can do only tableSize/32 writes
|
||||
// instead of the 2048 writes that would zero-initialize all of table's
|
||||
// 32768 bytes. This clear could overrun the first tableSize elements, but
|
||||
// it won't overrun the allocated stack size.
|
||||
ADD $128, RSP, R17
|
||||
MOVD R17, R4
|
||||
|
||||
// !!! R6 = &table[tableSize]
|
||||
ADD R6<<1, R17, R6
|
||||
|
||||
// zero the SIMD registers
|
||||
VEOR V0.B16, V0.B16, V0.B16
|
||||
VEOR V1.B16, V1.B16, V1.B16
|
||||
VEOR V2.B16, V2.B16, V2.B16
|
||||
VEOR V3.B16, V3.B16, V3.B16
|
||||
|
||||
memclr:
|
||||
VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
|
||||
STP.P (ZR, ZR), 64(R4)
|
||||
STP (ZR, ZR), -48(R4)
|
||||
STP (ZR, ZR), -32(R4)
|
||||
STP (ZR, ZR), -16(R4)
|
||||
CMP R4, R6
|
||||
BHI memclr
|
||||
|
||||
|
@ -404,8 +395,7 @@ fourByteMatch:
|
|||
// on inputMargin in encode.go.
|
||||
MOVD R7, R3
|
||||
SUB R10, R3, R3
|
||||
MOVD $16, R2
|
||||
CMP R2, R3
|
||||
CMP $16, R3
|
||||
BLE emitLiteralFastPath
|
||||
|
||||
// ----------------------------------------
|
||||
|
@ -465,6 +455,9 @@ inlineEmitLiteralMemmove:
|
|||
MOVD 88(RSP), R8
|
||||
MOVD 96(RSP), R9
|
||||
MOVD 120(RSP), R15
|
||||
ADD $128, RSP, R17
|
||||
MOVW $0xa7bd, R16
|
||||
MOVKW $(0x1e35<<16), R16
|
||||
B inner1
|
||||
|
||||
inlineEmitLiteralEnd:
|
||||
|
@ -489,8 +482,8 @@ emitLiteralFastPath:
|
|||
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
|
||||
// 16-byte loads and stores. This technique probably wouldn't be as
|
||||
// effective on architectures that are fussier about alignment.
|
||||
VLD1 0(R10), [V0.B16]
|
||||
VST1 [V0.B16], 0(R8)
|
||||
LDP 0(R10), (R0, R1)
|
||||
STP (R0, R1), 0(R8)
|
||||
ADD R3, R8, R8
|
||||
|
||||
inner1:
|
||||
|
|
Загрузка…
Ссылка в новой задаче