diff --git a/encode_amd64.s b/encode_amd64.s
index 40dcde8..adfd979 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -251,8 +251,8 @@ extendMatchEnd:
 // - R10 .   &src[nextEmit]
 // - R11 96  prevHash, currHash, nextHash, offset
 // - R12 104 &src[base], skip
-// - R13 .   &src[nextS]
-// - R14 .   len(src), bytesBetweenHashLookups, x
+// - R13 .   &src[nextS], &src[len(src) - 8]
+// - R14 .   len(src), bytesBetweenHashLookups, &src[len(src)], x
 // - R15 112 candidate
 //
 // The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@ inlineEmitLiteralMemmove:
 	MOVQ DI, 0(SP)
 	MOVQ R10, 8(SP)
 	MOVQ AX, 16(SP)
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ AX, DI
+	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@ inner1:
 	SUBQ R15, R11
 	SUBQ DX, R11
 
-	// s = extendMatch(src, candidate+4, s+4)
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
 	//
-	// Push args.
-	MOVQ DX, 0(SP)
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
 	MOVQ src_len+32(FP), R14
-	MOVQ R14, 8(SP)
-	MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
 	ADDQ $4, R15
-	MOVQ R15, 24(SP)
+	ADDQ DX, R15
+
+	// !!! s += 4
 	ADDQ $4, SI
-	SUBQ DX, SI
-	MOVQ SI, 32(SP)
 
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	MOVQ R11, 96(SP)
-	MOVQ R12, 104(SP)
-	CALL ·extendMatch(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 96(SP), R11
-	MOVQ 104(SP), R12
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
 
-	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
-	// register holds &src[s], not s.
-	MOVQ 40(SP), SI
-	ADDQ DX, SI
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
 
 	// ----------------------------------------
 	// Begin inline of the emitCopy call.
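For reference, the comparison technique the inlined assembly uses (8 bytes at a time, then XOR plus a trailing-zero count to locate the first differing byte, then a byte-at-a-time tail) can be sketched in Go as below. This is a minimal illustration, not the package's actual pure-Go fallback; the name extendMatchSketch and the example input are made up for this sketch.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// extendMatchSketch mirrors the inlined logic: given that src[i:] and src[j:]
// match so far (i < j <= len(src)), return the new j after extending the
// match as far as possible.
func extendMatchSketch(src []byte, i, j int) int {
	// Fast path: while 8 or more bytes remain past j, compare 8 bytes at a
	// time (the MOVQ/CMPQ loop at inlineExtendMatchCmp8).
	for j+8 <= len(src) {
		x := binary.LittleEndian.Uint64(src[i:])
		y := binary.LittleEndian.Uint64(src[j:])
		if x != y {
			// The two words differ. XOR them and count trailing zero bits;
			// because the words were loaded little-endian (as amd64 MOVQ
			// does), that count divided by 8 is the index of the first
			// differing byte (the BSFQ/SHRQ sequence).
			return j + (bits.TrailingZeros64(x^y) >> 3)
		}
		i += 8
		j += 8
	}
	// Tail: fewer than 8 bytes remain, so compare 1 byte at a time
	// (inlineExtendMatchCmp1).
	for j < len(src) && src[i] == src[j] {
		i++
		j++
	}
	return j
}

func main() {
	src := []byte("abcdefgh_abcdefgX")
	// src[9:] matches src[0:] over "abcdefg", so the match extends to 16.
	fmt.Println(extendMatchSketch(src, 0, 9)) // prints 16
}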