Inline the extendMatch call.

Compared to the previous commit:

name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 0%   699MB/s ± 1%     ~     (p=0.123 n=10+10)
WordsEncode1e2-8   460MB/s ± 0%   583MB/s ± 1%  +26.64%  (p=0.000 n=10+10)
WordsEncode1e3-8   480MB/s ± 0%   647MB/s ± 2%  +34.85%  (p=0.000 n=10+10)
WordsEncode1e4-8   416MB/s ± 0%   451MB/s ± 0%   +8.30%  (p=0.000 n=10+8)
WordsEncode1e5-8   297MB/s ± 0%   355MB/s ± 2%  +19.50%  (p=0.000 n=10+9)
WordsEncode1e6-8   345MB/s ± 0%   433MB/s ± 2%  +25.47%  (p=0.000 n=10+9)
RandomEncode-8    14.4GB/s ± 2%  14.3GB/s ± 3%     ~     (p=0.075 n=10+10)
_ZFlat0-8          891MB/s ± 1%  1040MB/s ± 0%  +16.67%  (p=0.000 n=9+9)
_ZFlat1-8          471MB/s ± 0%   535MB/s ± 1%  +13.68%  (p=0.000 n=9+10)
_ZFlat2-8         16.2GB/s ± 3%  16.4GB/s ± 1%     ~     (p=0.122 n=10+8)
_ZFlat3-8          676MB/s ± 0%   762MB/s ± 0%  +12.62%  (p=0.000 n=10+9)
_ZFlat4-8         8.36GB/s ± 1%  9.47GB/s ± 1%  +13.28%  (p=0.000 n=10+10)
_ZFlat5-8          852MB/s ± 0%   986MB/s ± 1%  +15.79%  (p=0.000 n=10+9)
_ZFlat6-8          316MB/s ± 0%   380MB/s ± 1%  +20.41%  (p=0.000 n=8+9)
_ZFlat7-8          296MB/s ± 0%   353MB/s ± 0%  +19.44%  (p=0.000 n=8+10)
_ZFlat8-8          331MB/s ± 1%   399MB/s ± 0%  +20.53%  (p=0.000 n=9+8)
_ZFlat9-8          274MB/s ± 0%   329MB/s ± 0%  +20.27%  (p=0.000 n=8+9)
_ZFlat10-8        1.17GB/s ± 0%  1.35GB/s ± 1%  +15.15%  (p=0.000 n=9+9)
_ZFlat11-8         462MB/s ± 0%   608MB/s ± 0%  +31.54%  (p=0.000 n=9+9)

The net effect of the past four inlining commits, when compared to just
before c3defccc "Inline the emitCopy call":

name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 1%   699MB/s ± 1%     ~     (p=0.353 n=10+10)
WordsEncode1e2-8   429MB/s ± 0%   583MB/s ± 1%  +35.95%  (p=0.000 n=9+10)
WordsEncode1e3-8   447MB/s ± 0%   647MB/s ± 2%  +44.85%  (p=0.000 n=9+10)
WordsEncode1e4-8   322MB/s ± 1%   451MB/s ± 0%  +40.00%  (p=0.000 n=10+8)
WordsEncode1e5-8   268MB/s ± 0%   355MB/s ± 2%  +32.41%  (p=0.000 n=9+9)
WordsEncode1e6-8   313MB/s ± 0%   433MB/s ± 2%  +38.28%  (p=0.000 n=8+9)
RandomEncode-8    14.4GB/s ± 1%  14.3GB/s ± 3%     ~     (p=0.897 n=8+10)
_ZFlat0-8          797MB/s ± 2%  1040MB/s ± 0%  +30.53%  (p=0.000 n=9+9)
_ZFlat1-8          435MB/s ± 1%   535MB/s ± 1%  +22.97%  (p=0.000 n=9+10)
_ZFlat2-8         16.1GB/s ± 2%  16.4GB/s ± 1%   +1.47%  (p=0.001 n=10+8)
_ZFlat3-8          633MB/s ± 0%   762MB/s ± 0%  +20.32%  (p=0.000 n=10+9)
_ZFlat4-8         7.95GB/s ± 1%  9.47GB/s ± 1%  +19.11%  (p=0.000 n=10+10)
_ZFlat5-8          771MB/s ± 0%   986MB/s ± 1%  +27.83%  (p=0.000 n=10+9)
_ZFlat6-8          283MB/s ± 0%   380MB/s ± 1%  +34.46%  (p=0.000 n=10+9)
_ZFlat7-8          265MB/s ± 0%   353MB/s ± 0%  +33.29%  (p=0.000 n=9+10)
_ZFlat8-8          299MB/s ± 0%   399MB/s ± 0%  +33.36%  (p=0.000 n=9+8)
_ZFlat9-8          246MB/s ± 1%   329MB/s ± 0%  +33.58%  (p=0.000 n=10+9)
_ZFlat10-8        1.05GB/s ± 1%  1.35GB/s ± 1%  +28.35%  (p=0.000 n=10+9)
_ZFlat11-8         411MB/s ± 0%   608MB/s ± 0%  +47.82%  (p=0.000 n=10+9)
2016-04-29 14:21:44 +10:00 · 2016-04-29 14:21:44 +10:00 · dfb3612ba2
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -251,8 +251,8 @@ extendMatchEnd:
 //	- R10	.	&src[nextEmit]
 //	- R11	96	prevHash, currHash, nextHash, offset
 //	- R12	104	&src[base], skip
-//	- R13	.	&src[nextS]
-//	- R14	.	len(src), bytesBetweenHashLookups, x
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
 //	- R15	112	candidate
 //
 // The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@ inlineEmitLiteralMemmove:
 	MOVQ DI, 0(SP)
 	MOVQ R10, 8(SP)
 	MOVQ AX, 16(SP)
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ AX, DI
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@ inner1:
 	SUBQ R15, R11
 	SUBQ DX, R11

-	// s = extendMatch(src, candidate+4, s+4)
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
 	//
-	// Push args.
-	MOVQ DX, 0(SP)
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
 	MOVQ src_len+32(FP), R14
-	MOVQ R14, 8(SP)
-	MOVQ R14, 16(SP)         // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
 	ADDQ $4, R15
-	MOVQ R15, 24(SP)
+	ADDQ DX, R15
+
+	// !!! s += 4
 	ADDQ $4, SI
-	SUBQ DX, SI
-	MOVQ SI, 32(SP)

-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	MOVQ R11, 96(SP)
-	MOVQ R12, 104(SP)
-	CALL ·extendMatch(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 96(SP), R11
-	MOVQ 104(SP), R12
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8

-	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
-	// register holds &src[s], not s.
-	MOVQ 40(SP), SI
-	ADDQ DX, SI
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------

 	// ----------------------------------------
 	// Begin inline of the emitCopy call.