зеркало из https://github.com/golang/snappy.git
Inline the extendMatch call.
Compared to the previous commit:
name old speed new speed delta
WordsEncode1e1-8 701MB/s ± 0% 699MB/s ± 1% ~ (p=0.123 n=10+10)
WordsEncode1e2-8 460MB/s ± 0% 583MB/s ± 1% +26.64% (p=0.000 n=10+10)
WordsEncode1e3-8 480MB/s ± 0% 647MB/s ± 2% +34.85% (p=0.000 n=10+10)
WordsEncode1e4-8 416MB/s ± 0% 451MB/s ± 0% +8.30% (p=0.000 n=10+8)
WordsEncode1e5-8 297MB/s ± 0% 355MB/s ± 2% +19.50% (p=0.000 n=10+9)
WordsEncode1e6-8 345MB/s ± 0% 433MB/s ± 2% +25.47% (p=0.000 n=10+9)
RandomEncode-8 14.4GB/s ± 2% 14.3GB/s ± 3% ~ (p=0.075 n=10+10)
_ZFlat0-8 891MB/s ± 1% 1040MB/s ± 0% +16.67% (p=0.000 n=9+9)
_ZFlat1-8 471MB/s ± 0% 535MB/s ± 1% +13.68% (p=0.000 n=9+10)
_ZFlat2-8 16.2GB/s ± 3% 16.4GB/s ± 1% ~ (p=0.122 n=10+8)
_ZFlat3-8 676MB/s ± 0% 762MB/s ± 0% +12.62% (p=0.000 n=10+9)
_ZFlat4-8 8.36GB/s ± 1% 9.47GB/s ± 1% +13.28% (p=0.000 n=10+10)
_ZFlat5-8 852MB/s ± 0% 986MB/s ± 1% +15.79% (p=0.000 n=10+9)
_ZFlat6-8 316MB/s ± 0% 380MB/s ± 1% +20.41% (p=0.000 n=8+9)
_ZFlat7-8 296MB/s ± 0% 353MB/s ± 0% +19.44% (p=0.000 n=8+10)
_ZFlat8-8 331MB/s ± 1% 399MB/s ± 0% +20.53% (p=0.000 n=9+8)
_ZFlat9-8 274MB/s ± 0% 329MB/s ± 0% +20.27% (p=0.000 n=8+9)
_ZFlat10-8 1.17GB/s ± 0% 1.35GB/s ± 1% +15.15% (p=0.000 n=9+9)
_ZFlat11-8 462MB/s ± 0% 608MB/s ± 0% +31.54% (p=0.000 n=9+9)
The net effect of the past four inlining commits, when compared to just
before c3defccc
"Inline the emitCopy call":
name old speed new speed delta
WordsEncode1e1-8 701MB/s ± 1% 699MB/s ± 1% ~ (p=0.353 n=10+10)
WordsEncode1e2-8 429MB/s ± 0% 583MB/s ± 1% +35.95% (p=0.000 n=9+10)
WordsEncode1e3-8 447MB/s ± 0% 647MB/s ± 2% +44.85% (p=0.000 n=9+10)
WordsEncode1e4-8 322MB/s ± 1% 451MB/s ± 0% +40.00% (p=0.000 n=10+8)
WordsEncode1e5-8 268MB/s ± 0% 355MB/s ± 2% +32.41% (p=0.000 n=9+9)
WordsEncode1e6-8 313MB/s ± 0% 433MB/s ± 2% +38.28% (p=0.000 n=8+9)
RandomEncode-8 14.4GB/s ± 1% 14.3GB/s ± 3% ~ (p=0.897 n=8+10)
_ZFlat0-8 797MB/s ± 2% 1040MB/s ± 0% +30.53% (p=0.000 n=9+9)
_ZFlat1-8 435MB/s ± 1% 535MB/s ± 1% +22.97% (p=0.000 n=9+10)
_ZFlat2-8 16.1GB/s ± 2% 16.4GB/s ± 1% +1.47% (p=0.001 n=10+8)
_ZFlat3-8 633MB/s ± 0% 762MB/s ± 0% +20.32% (p=0.000 n=10+9)
_ZFlat4-8 7.95GB/s ± 1% 9.47GB/s ± 1% +19.11% (p=0.000 n=10+10)
_ZFlat5-8 771MB/s ± 0% 986MB/s ± 1% +27.83% (p=0.000 n=10+9)
_ZFlat6-8 283MB/s ± 0% 380MB/s ± 1% +34.46% (p=0.000 n=10+9)
_ZFlat7-8 265MB/s ± 0% 353MB/s ± 0% +33.29% (p=0.000 n=9+10)
_ZFlat8-8 299MB/s ± 0% 399MB/s ± 0% +33.36% (p=0.000 n=9+8)
_ZFlat9-8 246MB/s ± 1% 329MB/s ± 0% +33.58% (p=0.000 n=10+9)
_ZFlat10-8 1.05GB/s ± 1% 1.35GB/s ± 1% +28.35% (p=0.000 n=10+9)
_ZFlat11-8 411MB/s ± 0% 608MB/s ± 0% +47.82% (p=0.000 n=10+9)
This commit is contained in:
Родитель
c707890a47
Коммит
dfb3612ba2
|
@ -251,8 +251,8 @@ extendMatchEnd:
|
|||
// - R10 . &src[nextEmit]
|
||||
// - R11 96 prevHash, currHash, nextHash, offset
|
||||
// - R12 104 &src[base], skip
|
||||
// - R13 . &src[nextS]
|
||||
// - R14 . len(src), bytesBetweenHashLookups, x
|
||||
// - R13 . &src[nextS], &src[len(src) - 8]
|
||||
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
|
||||
// - R15 112 candidate
|
||||
//
|
||||
// The second column (56, 64, etc) is the stack offset to spill the registers
|
||||
|
@ -444,8 +444,7 @@ inlineEmitLiteralMemmove:
|
|||
MOVQ DI, 0(SP)
|
||||
MOVQ R10, 8(SP)
|
||||
MOVQ AX, 16(SP)
|
||||
// Finish the "d +=" part of "d += emitLiteral(etc)".
|
||||
ADDQ AX, DI
|
||||
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
|
||||
MOVQ SI, 72(SP)
|
||||
MOVQ DI, 80(SP)
|
||||
MOVQ R15, 112(SP)
|
||||
|
@ -494,35 +493,65 @@ inner1:
|
|||
SUBQ R15, R11
|
||||
SUBQ DX, R11
|
||||
|
||||
// s = extendMatch(src, candidate+4, s+4)
|
||||
// ----------------------------------------
|
||||
// Begin inline of the extendMatch call.
|
||||
//
|
||||
// Push args.
|
||||
MOVQ DX, 0(SP)
|
||||
// s = extendMatch(src, candidate+4, s+4)
|
||||
|
||||
// !!! R14 = &src[len(src)]
|
||||
MOVQ src_len+32(FP), R14
|
||||
MOVQ R14, 8(SP)
|
||||
MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
ADDQ DX, R14
|
||||
|
||||
// !!! R13 = &src[len(src) - 8]
|
||||
MOVQ R14, R13
|
||||
SUBQ $8, R13
|
||||
|
||||
// !!! R15 = &src[candidate + 4]
|
||||
ADDQ $4, R15
|
||||
MOVQ R15, 24(SP)
|
||||
ADDQ DX, R15
|
||||
|
||||
// !!! s += 4
|
||||
ADDQ $4, SI
|
||||
SUBQ DX, SI
|
||||
MOVQ SI, 32(SP)
|
||||
|
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
MOVQ DI, 80(SP)
|
||||
MOVQ R11, 96(SP)
|
||||
MOVQ R12, 104(SP)
|
||||
CALL ·extendMatch(SB)
|
||||
MOVQ 56(SP), CX
|
||||
MOVQ 64(SP), DX
|
||||
MOVQ 80(SP), DI
|
||||
MOVQ 88(SP), R9
|
||||
MOVQ 96(SP), R11
|
||||
MOVQ 104(SP), R12
|
||||
inlineExtendMatchCmp8:
|
||||
// As long as we are 8 or more bytes before the end of src, we can load and
|
||||
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
|
||||
CMPQ SI, R13
|
||||
JA inlineExtendMatchCmp1
|
||||
MOVQ (R15), AX
|
||||
MOVQ (SI), BX
|
||||
CMPQ AX, BX
|
||||
JNE inlineExtendMatchBSF
|
||||
ADDQ $8, R15
|
||||
ADDQ $8, SI
|
||||
JMP inlineExtendMatchCmp8
|
||||
|
||||
// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
|
||||
// register holds &src[s], not s.
|
||||
MOVQ 40(SP), SI
|
||||
ADDQ DX, SI
|
||||
inlineExtendMatchBSF:
|
||||
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
|
||||
// the index of the first byte that differs. The BSF instruction finds the
|
||||
// least significant 1 bit, the amd64 architecture is little-endian, and
|
||||
// the shift by 3 converts a bit index to a byte index.
|
||||
XORQ AX, BX
|
||||
BSFQ BX, BX
|
||||
SHRQ $3, BX
|
||||
ADDQ BX, SI
|
||||
JMP inlineExtendMatchEnd
|
||||
|
||||
inlineExtendMatchCmp1:
|
||||
// In src's tail, compare 1 byte at a time.
|
||||
CMPQ SI, R14
|
||||
JAE inlineExtendMatchEnd
|
||||
MOVB (R15), AX
|
||||
MOVB (SI), BX
|
||||
CMPB AX, BX
|
||||
JNE inlineExtendMatchEnd
|
||||
ADDQ $1, R15
|
||||
ADDQ $1, SI
|
||||
JMP inlineExtendMatchCmp1
|
||||
|
||||
inlineExtendMatchEnd:
|
||||
// End inline of the extendMatch call.
|
||||
// ----------------------------------------
|
||||
|
||||
// ----------------------------------------
|
||||
// Begin inline of the emitCopy call.
|
||||
|
|
Загрузка…
Ссылка в новой задаче