Compared to the previous commit:
name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 0%   699MB/s ± 1%     ~     (p=0.123 n=10+10)
WordsEncode1e2-8   460MB/s ± 0%   583MB/s ± 1%  +26.64%  (p=0.000 n=10+10)
WordsEncode1e3-8   480MB/s ± 0%   647MB/s ± 2%  +34.85%  (p=0.000 n=10+10)
WordsEncode1e4-8   416MB/s ± 0%   451MB/s ± 0%   +8.30%   (p=0.000 n=10+8)
WordsEncode1e5-8   297MB/s ± 0%   355MB/s ± 2%  +19.50%   (p=0.000 n=10+9)
WordsEncode1e6-8   345MB/s ± 0%   433MB/s ± 2%  +25.47%   (p=0.000 n=10+9)
RandomEncode-8    14.4GB/s ± 2%  14.3GB/s ± 3%     ~     (p=0.075 n=10+10)
_ZFlat0-8          891MB/s ± 1%  1040MB/s ± 0%  +16.67%    (p=0.000 n=9+9)
_ZFlat1-8          471MB/s ± 0%   535MB/s ± 1%  +13.68%   (p=0.000 n=9+10)
_ZFlat2-8         16.2GB/s ± 3%  16.4GB/s ± 1%     ~      (p=0.122 n=10+8)
_ZFlat3-8          676MB/s ± 0%   762MB/s ± 0%  +12.62%   (p=0.000 n=10+9)
_ZFlat4-8         8.36GB/s ± 1%  9.47GB/s ± 1%  +13.28%  (p=0.000 n=10+10)
_ZFlat5-8          852MB/s ± 0%   986MB/s ± 1%  +15.79%   (p=0.000 n=10+9)
_ZFlat6-8          316MB/s ± 0%   380MB/s ± 1%  +20.41%    (p=0.000 n=8+9)
_ZFlat7-8          296MB/s ± 0%   353MB/s ± 0%  +19.44%   (p=0.000 n=8+10)
_ZFlat8-8          331MB/s ± 1%   399MB/s ± 0%  +20.53%    (p=0.000 n=9+8)
_ZFlat9-8          274MB/s ± 0%   329MB/s ± 0%  +20.27%    (p=0.000 n=8+9)
_ZFlat10-8        1.17GB/s ± 0%  1.35GB/s ± 1%  +15.15%    (p=0.000 n=9+9)
_ZFlat11-8         462MB/s ± 0%   608MB/s ± 0%  +31.54%    (p=0.000 n=9+9)

The net effect of the past four inlining commits, when compared to just
before c3defccc "Inline the emitCopy call":
name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 1%   699MB/s ± 1%     ~     (p=0.353 n=10+10)
WordsEncode1e2-8   429MB/s ± 0%   583MB/s ± 1%  +35.95%   (p=0.000 n=9+10)
WordsEncode1e3-8   447MB/s ± 0%   647MB/s ± 2%  +44.85%   (p=0.000 n=9+10)
WordsEncode1e4-8   322MB/s ± 1%   451MB/s ± 0%  +40.00%   (p=0.000 n=10+8)
WordsEncode1e5-8   268MB/s ± 0%   355MB/s ± 2%  +32.41%    (p=0.000 n=9+9)
WordsEncode1e6-8   313MB/s ± 0%   433MB/s ± 2%  +38.28%    (p=0.000 n=8+9)
RandomEncode-8    14.4GB/s ± 1%  14.3GB/s ± 3%     ~      (p=0.897 n=8+10)
_ZFlat0-8          797MB/s ± 2%  1040MB/s ± 0%  +30.53%    (p=0.000 n=9+9)
_ZFlat1-8          435MB/s ± 1%   535MB/s ± 1%  +22.97%   (p=0.000 n=9+10)
_ZFlat2-8         16.1GB/s ± 2%  16.4GB/s ± 1%   +1.47%   (p=0.001 n=10+8)
_ZFlat3-8          633MB/s ± 0%   762MB/s ± 0%  +20.32%   (p=0.000 n=10+9)
_ZFlat4-8         7.95GB/s ± 1%  9.47GB/s ± 1%  +19.11%  (p=0.000 n=10+10)
_ZFlat5-8          771MB/s ± 0%   986MB/s ± 1%  +27.83%   (p=0.000 n=10+9)
_ZFlat6-8          283MB/s ± 0%   380MB/s ± 1%  +34.46%   (p=0.000 n=10+9)
_ZFlat7-8          265MB/s ± 0%   353MB/s ± 0%  +33.29%   (p=0.000 n=9+10)
_ZFlat8-8          299MB/s ± 0%   399MB/s ± 0%  +33.36%    (p=0.000 n=9+8)
_ZFlat9-8          246MB/s ± 1%   329MB/s ± 0%  +33.58%   (p=0.000 n=10+9)
_ZFlat10-8        1.05GB/s ± 1%  1.35GB/s ± 1%  +28.35%   (p=0.000 n=10+9)
_ZFlat11-8         411MB/s ± 0%   608MB/s ± 0%  +47.82%   (p=0.000 n=10+9)
This commit is contained in:
Nigel Tao 2016-04-29 14:21:44 +10:00
Родитель c707890a47
Коммит dfb3612ba2
1 изменённый файл: 56 добавлений и 27 удалений

Просмотреть файл

@@ -251,8 +251,8 @@ extendMatchEnd:
// - R10 . &src[nextEmit]
// - R11 96 prevHash, currHash, nextHash, offset
// - R12 104 &src[base], skip
// - R13 . &src[nextS]
// - R14 . len(src), bytesBetweenHashLookups, x
// - R13 . &src[nextS], &src[len(src) - 8]
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
// - R15 112 candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@ inlineEmitLiteralMemmove:
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
// Finish the "d +=" part of "d += emitLiteral(etc)".
ADDQ AX, DI
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@ inner1:
SUBQ R15, R11
SUBQ DX, R11
// s = extendMatch(src, candidate+4, s+4)
// ----------------------------------------
// Begin inline of the extendMatch call.
//
// Push args.
MOVQ DX, 0(SP)
// s = extendMatch(src, candidate+4, s+4)
// !!! R14 = &src[len(src)]
MOVQ src_len+32(FP), R14
MOVQ R14, 8(SP)
MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
ADDQ DX, R14
// !!! R13 = &src[len(src) - 8]
MOVQ R14, R13
SUBQ $8, R13
// !!! R15 = &src[candidate + 4]
ADDQ $4, R15
MOVQ R15, 24(SP)
ADDQ DX, R15
// !!! s += 4
ADDQ $4, SI
SUBQ DX, SI
MOVQ SI, 32(SP)
// Spill local variables (registers) onto the stack; call; unspill.
MOVQ DI, 80(SP)
MOVQ R11, 96(SP)
MOVQ R12, 104(SP)
CALL ·extendMatch(SB)
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 80(SP), DI
MOVQ 88(SP), R9
MOVQ 96(SP), R11
MOVQ 104(SP), R12
inlineExtendMatchCmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ SI, R13
JA inlineExtendMatchCmp1
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE inlineExtendMatchBSF
ADDQ $8, R15
ADDQ $8, SI
JMP inlineExtendMatchCmp8
// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
// register holds &src[s], not s.
MOVQ 40(SP), SI
ADDQ DX, SI
inlineExtendMatchBSF:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, SI
JMP inlineExtendMatchEnd
inlineExtendMatchCmp1:
// In src's tail, compare 1 byte at a time.
CMPQ SI, R14
JAE inlineExtendMatchEnd
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE inlineExtendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
JMP inlineExtendMatchCmp1
inlineExtendMatchEnd:
// End inline of the extendMatch call.
// ----------------------------------------
// ----------------------------------------
// Begin inline of the emitCopy call.