diff --git a/encode_amd64.s b/encode_amd64.s
index 40dcde8..adfd979 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -251,8 +251,8 @@ extendMatchEnd:
 // - R10 .   &src[nextEmit]
 // - R11 96  prevHash, currHash, nextHash, offset
 // - R12 104 &src[base], skip
-// - R13 .   &src[nextS]
-// - R14 .   len(src), bytesBetweenHashLookups, x
+// - R13 .   &src[nextS], &src[len(src) - 8]
+// - R14 .   len(src), bytesBetweenHashLookups, &src[len(src)], x
 // - R15 112 candidate
 //
 // The second column (56, 64, etc) is the stack offset to spill the registers
@@ -444,8 +444,7 @@ inlineEmitLiteralMemmove:
 	MOVQ DI, 0(SP)
 	MOVQ R10, 8(SP)
 	MOVQ AX, 16(SP)
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ AX, DI
+	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
@@ -494,35 +493,65 @@ inner1:
 	SUBQ R15, R11
 	SUBQ DX, R11
 
-	// s = extendMatch(src, candidate+4, s+4)
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
 	//
-	// Push args.
-	MOVQ DX, 0(SP)
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
 	MOVQ src_len+32(FP), R14
-	MOVQ R14, 8(SP)
-	MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
 	ADDQ $4, R15
-	MOVQ R15, 24(SP)
+	ADDQ DX, R15
+
+	// !!! s += 4
 	ADDQ $4, SI
-	SUBQ DX, SI
-	MOVQ SI, 32(SP)
 
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	MOVQ R11, 96(SP)
-	MOVQ R12, 104(SP)
-	CALL ·extendMatch(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 96(SP), R11
-	MOVQ 104(SP), R12
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
 
-	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
-	// register holds &src[s], not s.
-	MOVQ 40(SP), SI
-	ADDQ DX, SI
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
 
 	// ----------------------------------------
 	// Begin inline of the emitCopy call.
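For reference, the comparison technique the inlined assembly uses (8 bytes at a time, then XOR plus a trailing-zero count to locate the first differing byte, then a byte-at-a-time tail) can be sketched in Go as below. This is a minimal illustration, not the package's actual pure-Go fallback; the name extendMatchSketch and the example input are made up for this sketch.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// extendMatchSketch mirrors the inlined logic: given that src[i:] and src[j:]
// match so far (i < j <= len(src)), return the new j after extending the
// match as far as possible.
func extendMatchSketch(src []byte, i, j int) int {
	// Fast path: while 8 or more bytes remain past j, compare 8 bytes at a
	// time (the MOVQ/CMPQ loop at inlineExtendMatchCmp8).
	for j+8 <= len(src) {
		x := binary.LittleEndian.Uint64(src[i:])
		y := binary.LittleEndian.Uint64(src[j:])
		if x != y {
			// The two words differ. XOR them and count trailing zero bits;
			// because the words were loaded little-endian (as amd64 MOVQ
			// does), that count divided by 8 is the index of the first
			// differing byte (the BSFQ/SHRQ sequence).
			return j + (bits.TrailingZeros64(x^y) >> 3)
		}
		i += 8
		j += 8
	}
	// Tail: fewer than 8 bytes remain, so compare 1 byte at a time
	// (inlineExtendMatchCmp1).
	for j < len(src) && src[i] == src[j] {
		i++
		j++
	}
	return j
}

func main() {
	src := []byte("abcdefgh_abcdefgX")
	// src[9:] matches src[0:] over "abcdefg", so the match extends to 16.
	fmt.Println(extendMatchSketch(src, 0, 9)) // prints 16
}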