From f81760ec4c9208e6e6866cf18e1888544fd2dbc9 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Fri, 2 Oct 2020 15:49:34 +0000 Subject: [PATCH] bug fix to encode_arm64.s: some registers overwritten in memmove call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In encode_arm64.s, encodeBlock, two of the registers added during the port from amd64 were not saved or restored for the memmove call. Instead of saving them, just recalculate their values. Additionally, I made a few small changes to improve things since I've learned a bit more about ARMv8 assembly. - The CMP instruction accepts an immediate as the first argument - use LDP/STP instead of SIMD instructions The change to use the load-pair and store-pair instructions instead of the SIMD instructions results in some modest performance improvements as measured on Neoverse N1 (Graviton 2). name old time/op new time/op delta WordsDecode1e1-2 25.9ns ± 1% 26.1ns ± 1% +0.66% (p=0.005 n=10+10) WordsDecode1e2-2 107ns ± 0% 105ns ± 0% -1.87% (p=0.000 n=10+10) WordsDecode1e3-2 953ns ± 0% 901ns ± 0% -5.50% (p=0.000 n=10+10) WordsDecode1e4-2 10.6µs ± 0% 9.9µs ± 2% -6.60% (p=0.000 n=7+10) WordsDecode1e5-2 170µs ± 1% 164µs ± 1% -3.12% (p=0.000 n=10+9) WordsDecode1e6-2 1.71ms ± 0% 1.66ms ± 0% -2.98% (p=0.000 n=10+10) WordsEncode1e1-2 22.0ns ± 1% 21.9ns ± 1% -0.67% (p=0.006 n=8+10) WordsEncode1e2-2 248ns ± 0% 245ns ± 0% -1.21% (p=0.002 n=8+10) WordsEncode1e3-2 2.50µs ± 0% 2.49µs ± 0% ~ (p=0.103 n=10+9) WordsEncode1e4-2 27.8µs ± 3% 28.0µs ± 2% ~ (p=0.075 n=10+10) WordsEncode1e5-2 339µs ± 0% 343µs ± 0% +1.18% (p=0.000 n=9+10) WordsEncode1e6-2 3.39ms ± 0% 3.42ms ± 0% +0.94% (p=0.000 n=10+10) RandomEncode-2 74.8µs ± 1% 77.1µs ± 1% +3.16% (p=0.000 n=10+10) _UFlat0-2 68.8µs ± 1% 66.4µs ± 2% -3.54% (p=0.000 n=10+10) _UFlat1-2 770µs ± 0% 740µs ± 1% -3.93% (p=0.000 n=10+10) _UFlat2-2 6.57µs ± 0% 6.55µs ± 0% -0.25% (p=0.000 n=8+10) _UFlat3-2 183ns ± 0% 178ns ± 1% -2.84% (p=0.000 n=9+10) 
_UFlat4-2 9.76µs ± 1% 9.56µs ± 0% -2.07% (p=0.000 n=10+9) _UFlat5-2 301µs ± 0% 293µs ± 0% -2.67% (p=0.000 n=9+10) _UFlat6-2 280µs ± 1% 267µs ± 1% -4.63% (p=0.000 n=10+10) _UFlat7-2 241µs ± 0% 230µs ± 1% -4.68% (p=0.000 n=9+10) _UFlat8-2 745µs ± 0% 715µs ± 1% -4.11% (p=0.000 n=10+10) _UFlat9-2 1.01ms ± 0% 0.96ms ± 0% -4.60% (p=0.000 n=10+10) _UFlat10-2 62.3µs ± 1% 59.3µs ± 1% -4.72% (p=0.000 n=10+9) _UFlat11-2 258µs ± 0% 252µs ± 1% -2.56% (p=0.000 n=10+10) _ZFlat0-2 135µs ± 1% 132µs ± 1% -1.88% (p=0.000 n=10+8) _ZFlat1-2 1.76ms ± 0% 1.74ms ± 0% -1.00% (p=0.000 n=9+9) _ZFlat2-2 9.54µs ± 0% 9.84µs ± 5% +3.18% (p=0.000 n=10+10) _ZFlat3-2 449ns ± 0% 447ns ± 0% -0.38% (p=0.000 n=10+9) _ZFlat4-2 15.6µs ± 0% 16.0µs ± 4% ~ (p=0.118 n=9+10) _ZFlat5-2 560µs ± 1% 555µs ± 1% -0.89% (p=0.000 n=9+9) _ZFlat6-2 531µs ± 0% 534µs ± 0% +0.64% (p=0.000 n=10+10) _ZFlat7-2 466µs ± 0% 468µs ± 0% +0.32% (p=0.003 n=10+10) _ZFlat8-2 1.42ms ± 0% 1.42ms ± 0% +0.43% (p=0.000 n=10+10) _ZFlat9-2 1.93ms ± 0% 1.94ms ± 0% +0.44% (p=0.000 n=10+10) _ZFlat10-2 120µs ± 0% 121µs ± 3% ~ (p=0.436 n=9+9) _ZFlat11-2 433µs ± 0% 437µs ± 0% +1.03% (p=0.000 n=10+10) ExtendMatch-2 9.77µs ± 0% 9.76µs ± 0% -0.13% (p=0.050 n=10+10) As measured on Cortex-A53 (Raspberry Pi 3) name old time/op new time/op delta WordsDecode1e1-4 152ns ± 2% 151ns ± 0% ~ (p=0.536 n=10+8) WordsDecode1e2-4 639ns ± 0% 617ns ± 0% -3.54% (p=0.000 n=9+8) WordsDecode1e3-4 6.74µs ± 2% 6.35µs ± 0% -5.75% (p=0.000 n=10+9) WordsDecode1e4-4 66.7µs ± 0% 63.5µs ± 0% -4.69% (p=0.000 n=9+9) WordsDecode1e5-4 715µs ± 0% 684µs ± 0% -4.38% (p=0.000 n=8+8) WordsDecode1e6-4 6.87ms ± 2% 6.53ms ± 1% -4.99% (p=0.000 n=10+9) WordsEncode1e1-4 127ns ± 2% 126ns ± 0% ~ (p=0.065 n=10+9) WordsEncode1e2-4 1.58µs ± 0% 1.57µs ± 0% -0.99% (p=0.000 n=8+8) WordsEncode1e3-4 15.1µs ± 0% 14.9µs ± 0% -1.46% (p=0.000 n=9+8) WordsEncode1e4-4 148µs ± 0% 148µs ± 4% ~ (p=0.497 n=9+10) WordsEncode1e5-4 1.54ms ± 0% 1.54ms ± 0% +0.12% (p=0.012 n=10+8) WordsEncode1e6-4 14.4ms ± 0% 14.4ms 
± 1% -0.47% (p=0.015 n=9+8) RandomEncode-4 1.13ms ± 1% 1.13ms ± 1% ~ (p=0.529 n=10+10) _UFlat0-4 294µs ± 0% 288µs ± 1% -2.08% (p=0.000 n=9+9) _UFlat1-4 3.05ms ± 1% 2.98ms ± 1% -2.22% (p=0.000 n=9+9) _UFlat2-4 37.3µs ± 0% 37.4µs ± 1% ~ (p=0.093 n=8+9) _UFlat3-4 909ns ± 0% 914ns ± 2% ~ (p=0.526 n=8+10) _UFlat4-4 58.7µs ± 0% 58.1µs ± 0% -1.09% (p=0.000 n=8+10) _UFlat5-4 1.22ms ± 0% 1.19ms ± 1% -2.14% (p=0.000 n=8+8) _UFlat6-4 1.03ms ± 0% 0.99ms ± 0% -3.28% (p=0.000 n=9+8) _UFlat7-4 895µs ± 0% 861µs ± 0% -3.79% (p=0.000 n=8+8) _UFlat8-4 2.83ms ± 0% 2.75ms ± 0% -2.88% (p=0.000 n=7+8) _UFlat9-4 3.85ms ± 1% 3.73ms ± 1% -3.03% (p=0.000 n=8+9) _UFlat10-4 286µs ± 0% 282µs ± 0% -1.59% (p=0.000 n=9+9) _UFlat11-4 1.06ms ± 0% 1.02ms ± 0% -3.58% (p=0.000 n=8+9) _ZFlat0-4 620µs ± 0% 620µs ± 1% ~ (p=0.963 n=9+8) _ZFlat1-4 9.49ms ± 1% 9.67ms ± 3% +1.87% (p=0.000 n=9+10) _ZFlat2-4 61.8µs ± 0% 62.3µs ± 3% ~ (p=0.829 n=8+10) _ZFlat3-4 2.80µs ± 1% 2.79µs ± 0% -0.55% (p=0.000 n=8+8) _ZFlat4-4 108µs ± 0% 109µs ± 0% +0.55% (p=0.000 n=10+8) _ZFlat5-4 2.59ms ± 2% 2.58ms ± 1% ~ (p=0.274 n=10+8) _ZFlat6-4 2.39ms ± 3% 2.40ms ± 1% ~ (p=0.631 n=10+10) _ZFlat7-4 2.11ms ± 0% 2.08ms ± 1% -1.23% (p=0.000 n=10+9) _ZFlat8-4 6.86ms ± 0% 6.92ms ± 1% +0.78% (p=0.000 n=9+8) _ZFlat9-4 9.42ms ± 0% 9.40ms ± 1% ~ (p=0.606 n=8+9) _ZFlat10-4 620µs ± 1% 621µs ± 4% ~ (p=0.173 n=8+10) _ZFlat11-4 1.94ms ± 0% 1.93ms ± 0% -0.52% (p=0.001 n=9+8) ExtendMatch-4 69.3µs ± 2% 69.2µs ± 0% ~ (p=0.515 n=10+8) --- decode_arm64.s | 45 +++++++++++----------------- encode_arm64.s | 81 +++++++++++++++++++++++--------------------------- 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/decode_arm64.s b/decode_arm64.s index bfafa0c..7a3ead1 100644 --- a/decode_arm64.s +++ b/decode_arm64.s @@ -70,7 +70,7 @@ loop: // x := uint32(src[s] >> 2) // switch MOVW $60, R1 - ADD R4>>2, ZR, R4 + LSRW $2, R4, R4 CMPW R4, R1 BLS tagLit60Plus @@ -111,13 +111,12 @@ doLit: // is contiguous in memory and so it needs to leave enough 
source bytes to // read the next tag without refilling buffers, but Go's Decode assumes // contiguousness (the src argument is a []byte). - MOVD $16, R1 - CMP R1, R4 - BGT callMemmove - CMP R1, R2 - BLT callMemmove - CMP R1, R3 - BLT callMemmove + CMP $16, R4 + BGT callMemmove + CMP $16, R2 + BLT callMemmove + CMP $16, R3 + BLT callMemmove // !!! Implement the copy from src to dst as a 16-byte load and store. // (Decode's documentation says that dst and src must not overlap.) @@ -130,9 +129,8 @@ doLit: // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or // 16-byte loads and stores. This technique probably wouldn't be as // effective on architectures that are fussier about alignment. - - VLD1 0(R6), [V0.B16] - VST1 [V0.B16], 0(R7) + LDP 0(R6), (R14, R15) + STP (R14, R15), 0(R7) // d += length // s += length @@ -210,8 +208,7 @@ tagLit61: B doLit tagLit62Plus: - MOVW $62, R1 - CMPW R1, R4 + CMPW $62, R4 BHI tagLit63 // case x == 62: @@ -273,10 +270,9 @@ tagCopy: // We have a copy tag. We assume that: // - R3 == src[s] & 0x03 // - R4 == src[s] - MOVD $2, R1 - CMP R1, R3 - BEQ tagCopy2 - BGT tagCopy4 + CMP $2, R3 + BEQ tagCopy2 + BGT tagCopy4 // case tagCopy1: // s += 2 @@ -346,13 +342,11 @@ doCopy: // } // copy 16 bytes // d += length - MOVD $16, R1 - MOVD $8, R0 - CMP R1, R4 + CMP $16, R4 BGT slowForwardCopy - CMP R0, R5 + CMP $8, R5 BLT slowForwardCopy - CMP R1, R14 + CMP $16, R14 BLT slowForwardCopy MOVD 0(R15), R2 MOVD R2, 0(R7) @@ -426,8 +420,7 @@ makeOffsetAtLeast8: // // The two previous lines together means that d-offset, and therefore // // R15, is unchanged. // } - MOVD $8, R1 - CMP R1, R5 + CMP $8, R5 BGE fixUpSlowForwardCopy MOVD (R15), R3 MOVD R3, (R7) @@ -477,9 +470,7 @@ verySlowForwardCopy: ADD $1, R15, R15 ADD $1, R7, R7 SUB $1, R4, R4 - MOVD $0, R1 - CMP R1, R4 - BNE verySlowForwardCopy + CBNZ R4, verySlowForwardCopy B loop // The code above handles copy tags. 
diff --git a/encode_arm64.s b/encode_arm64.s index 1f565ee..bf83667 100644 --- a/encode_arm64.s +++ b/encode_arm64.s @@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56 MOVW R3, R4 SUBW $1, R4, R4 - MOVW $60, R2 - CMPW R2, R4 + CMPW $60, R4 BLT oneByte - MOVW $256, R2 - CMPW R2, R4 + CMPW $256, R4 BLT twoBytes threeBytes: @@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48 loop0: // for length >= 68 { etc } - MOVW $68, R2 - CMPW R2, R3 + CMPW $68, R3 BLT step1 // Emit a length 64 copy, encoded as 3 bytes. @@ -112,9 +109,8 @@ loop0: step1: // if length > 64 { etc } - MOVD $64, R2 - CMP R2, R3 - BLE step2 + CMP $64, R3 + BLE step2 // Emit a length 60 copy, encoded as 3 bytes. MOVD $0xee, R2 @@ -125,11 +121,9 @@ step1: step2: // if length >= 12 || offset >= 2048 { goto step3 } - MOVD $12, R2 - CMP R2, R3 + CMP $12, R3 BGE step3 - MOVW $2048, R2 - CMPW R2, R11 + CMPW $2048, R11 BGE step3 // Emit the remaining copy, encoded as 2 bytes. @@ -295,27 +289,24 @@ varTable: // var table [maxTableSize]uint16 // // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each VST1 - // writes 64 bytes, so we can do only tableSize/32 writes instead of the - // 2048 writes that would zero-initialize all of table's 32768 bytes. - // This clear could overrun the first tableSize elements, but it won't - // overrun the allocated stack size. + // first tableSize elements. Each uint16 element is 2 bytes and each + // iteration writes 64 bytes, so we can do only tableSize/32 writes + // instead of the 2048 writes that would zero-initialize all of table's + // 32768 bytes. This clear could overrun the first tableSize elements, but + // it won't overrun the allocated stack size. ADD $128, RSP, R17 MOVD R17, R4 // !!! 
R6 = &src[tableSize] ADD R6<<1, R17, R6 - // zero the SIMD registers - VEOR V0.B16, V0.B16, V0.B16 - VEOR V1.B16, V1.B16, V1.B16 - VEOR V2.B16, V2.B16, V2.B16 - VEOR V3.B16, V3.B16, V3.B16 - memclr: - VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4) - CMP R4, R6 - BHI memclr + STP.P (ZR, ZR), 64(R4) + STP (ZR, ZR), -48(R4) + STP (ZR, ZR), -32(R4) + STP (ZR, ZR), -16(R4) + CMP R4, R6 + BHI memclr // !!! R6 = &src[0] MOVD R7, R6 @@ -404,8 +395,7 @@ fourByteMatch: // on inputMargin in encode.go. MOVD R7, R3 SUB R10, R3, R3 - MOVD $16, R2 - CMP R2, R3 + CMP $16, R3 BLE emitLiteralFastPath // ---------------------------------------- @@ -454,18 +444,21 @@ inlineEmitLiteralMemmove: MOVD R3, 24(RSP) // Finish the "d +=" part of "d += emitLiteral(etc)". - ADD R3, R8, R8 - MOVD R7, 80(RSP) - MOVD R8, 88(RSP) - MOVD R15, 120(RSP) - CALL runtime·memmove(SB) - MOVD 64(RSP), R5 - MOVD 72(RSP), R6 - MOVD 80(RSP), R7 - MOVD 88(RSP), R8 - MOVD 96(RSP), R9 - MOVD 120(RSP), R15 - B inner1 + ADD R3, R8, R8 + MOVD R7, 80(RSP) + MOVD R8, 88(RSP) + MOVD R15, 120(RSP) + CALL runtime·memmove(SB) + MOVD 64(RSP), R5 + MOVD 72(RSP), R6 + MOVD 80(RSP), R7 + MOVD 88(RSP), R8 + MOVD 96(RSP), R9 + MOVD 120(RSP), R15 + ADD $128, RSP, R17 + MOVW $0xa7bd, R16 + MOVKW $(0x1e35<<16), R16 + B inner1 inlineEmitLiteralEnd: // End inline of the emitLiteral call. @@ -489,9 +482,9 @@ emitLiteralFastPath: // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or // 16-byte loads and stores. This technique probably wouldn't be as // effective on architectures that are fussier about alignment. - VLD1 0(R10), [V0.B16] - VST1 [V0.B16], 0(R8) - ADD R3, R8, R8 + LDP 0(R10), (R0, R1) + STP (R0, R1), 0(R8) + ADD R3, R8, R8 inner1: // for { etc }