Use the same encoding algorithm as C++ snappy.

When encoding the benchmark files, the output is smaller for most inputs; the incompressible jpg, jpg_200 and pdf inputs are essentially unchanged:

len(in)  old_len(out)  new_len(out)  new/old_ratio  description
 102400         23488         22842           0.97  html
 702087        346345        335387           0.97  urls
 123093        123034        123034           1.00  jpg
    200           144           146           1.01  jpg_200
 102400         83786         83817           1.00  pdf
 409600         95095         92221           0.97  html4
 152089         91386         88017           0.96  txt1
 125179         80526         77525           0.96  txt2
 426754        244658        234392           0.96  txt3
 481861        331356        319097           0.96  txt4
 118588         24789         23295           0.94  pb
 184320         74129         69526           0.94  gaviota

On GOARCH=amd64, encoding throughput is also higher:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     674.93       681.22       1.01x
BenchmarkWordsEncode1e2-8     47.92        49.91        1.04x
BenchmarkWordsEncode1e3-8     189.48       213.64       1.13x
BenchmarkWordsEncode1e4-8     193.17       245.31       1.27x
BenchmarkWordsEncode1e5-8     151.44       178.84       1.18x
BenchmarkWordsEncode1e6-8     180.63       203.74       1.13x
BenchmarkRandomEncode-8       4700.25      5711.91      1.22x
Benchmark_ZFlat0-8            372.12       422.42       1.14x
Benchmark_ZFlat1-8            187.62       270.16       1.44x
Benchmark_ZFlat2-8            4891.26      5542.08      1.13x
Benchmark_ZFlat3-8            86.16        92.53        1.07x
Benchmark_ZFlat4-8            570.31       963.51       1.69x
Benchmark_ZFlat5-8            366.84       418.91       1.14x
Benchmark_ZFlat6-8            164.18       182.67       1.11x
Benchmark_ZFlat7-8            155.23       175.64       1.13x
Benchmark_ZFlat8-8            169.62       193.08       1.14x
Benchmark_ZFlat9-8            149.43       168.62       1.13x
Benchmark_ZFlat10-8           412.63       497.87       1.21x
Benchmark_ZFlat11-8           247.98       269.43       1.09x
Author: Nigel Tao, 2016-04-03 11:18:01 +10:00
Parent: ebebc71721
Commit: 8939696c22
2 changed files with 172 additions and 49 deletions

encode.go (131 changed lines)

@@ -14,11 +14,17 @@ import (
// code.
const maxOffset = 1 << 15
func load32(b []byte, i int32) uint32 {
func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func load64(b []byte, i int) uint64 {
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// emitLiteral writes a literal chunk and returns the number of bytes written.
func emitLiteral(dst, lit []byte) int {
i, n := 0, uint(len(lit)-1)
@@ -58,7 +64,7 @@ func emitLiteral(dst, lit []byte) int {
}
// emitCopy writes a copy chunk and returns the number of bytes written.
func emitCopy(dst []byte, offset, length int32) int {
func emitCopy(dst []byte, offset, length int) int {
i := 0
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
@@ -138,8 +144,6 @@ func Encode(dst, src []byte) []byte {
// that we don't overrun the dst and src buffers.
//
// TODO: implement this fast path.
//
// TODO: actually use inputMargin inside encodeBlock.
const inputMargin = 16 - 1
// minBlockSize is the minimum size of the input to encodeBlock. As above, we
@@ -149,6 +153,10 @@ const inputMargin = 16 - 1
// TODO: can we make this bound a little tighter, raising it by 1 or 2?
const minBlockSize = inputMargin
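// hash maps the 4 bytes packed into u to a table index. shift is 32 minus the
// table's log-size (it starts at 32-8 and drops by one each time tableSize
// doubles), so the result always lies in [0, tableSize).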
func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
@@ -159,19 +167,27 @@ const minBlockSize = inputMargin
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
const maxTableSize = 1 << 14
shift, tableSize := uint(32-8), 1<<8
shift, tableSize := uint32(32-8), 1<<8
for tableSize < maxTableSize && tableSize < len(src) {
shift--
tableSize *= 2
}
var table [maxTableSize]int32
// Iterate over the source bytes.
var (
s int32 // The iterator position.
t int32 // The last position with the same hash as s.
lit int32 // The start position of any pending literal bytes.
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := hash(load32(src, s), shift)
for {
// Copied from the C++ snappy implementation:
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
@@ -186,43 +202,74 @@ func encodeBlock(dst, src []byte) (d int) {
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
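// For example, with skip starting at 32, the first 32 probes advance s by
// one byte each (skip>>5 is 1), the next 32 probes by two bytes each, and
// so on, until a match resets skip back to 32.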
skip uint32 = 32
)
for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
// Update the hash table.
h := load32(src, s)
p := &table[(h*0x1e35a7bd)>>shift]
// We need to store values in [-1, inf) in table. To save
// some initialization time, (re)use the table's zero value
// and shift the values against this zero: add 1 on writes,
// subtract 1 on reads.
t, *p = *p-1, s+1
// If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte.
if t < 0 || s-t >= maxOffset || h != load32(src, t) {
s += int32(skip >> 5)
skip := 32
nextS := s
candidate := 0
for {
s = nextS
nextS = s + skip>>5
skip++
continue
if nextS > sLimit {
goto emitRemainder
}
candidate = int(table[nextHash])
table[nextHash] = int32(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
}
}
skip = 32
// Otherwise, we have a match. First, emit any pending literal bytes.
if lit != s {
d += emitLiteral(dst[d:], src[lit:s])
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
s += 4
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
}
d += emitCopy(dst[d:], base-candidate, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// We could immediately start working at s now, but to improve
// compression we first update the hash table at s-1 and at s. If
// another emitCopy is not our next move, also calculate nextHash
// at s+1. At least on GOARCH=amd64, these three hash calculations
// are faster as one load64 call (with some shifts) instead of
// three load32 calls.
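// x holds the 8 bytes src[s-1 .. s+6] in little-endian order, so uint32(x>>0),
// uint32(x>>8) and uint32(x>>16) are the 4-byte windows starting at s-1, s
// and s+1 respectively.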
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
table[prevHash] = int32(s - 1)
currHash := hash(uint32(x>>8), shift)
candidate = int(table[currHash])
table[currHash] = int32(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++
break
}
}
// Extend the match to be as long as possible.
s0 := s
s, t = s+4, t+4
for int(s) < len(src) && src[s] == src[t] {
s++
t++
}
// Emit the copied bytes.
d += emitCopy(dst[d:], s-t, s-s0)
lit = s
}
// Emit any final pending literal bytes and return.
if int(lit) != len(src) {
d += emitLiteral(dst[d:], src[lit:])
emitRemainder:
if nextEmit < len(src) {
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
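The package's exported API is unchanged by this commit; only the encoded bytes (and their size) can differ from previous releases. A minimal round trip through the encoder, sketched here assuming the usual github.com/golang/snappy import path:

	package main

	import (
		"bytes"
		"fmt"

		"github.com/golang/snappy"
	)

	func main() {
		src := []byte("hello, hello, hello, gopher")
		enc := snappy.Encode(nil, src) // the encoded bytes may differ from older releases
		dec, err := snappy.Decode(nil, enc)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%d -> %d bytes, round trip ok: %v\n", len(src), len(enc), bytes.Equal(dec, src))
	}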

snappy_test.go

@@ -14,13 +14,31 @@ import (
"math/rand"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"testing"
)
var download = flag.Bool("download", false, "If true, download any missing files before running benchmarks")
// goEncoderShouldMatchCppEncoder is whether to test that the algorithm used by
// Go's encoder matches byte-for-byte what the C++ snappy encoder produces.
// There is more than one valid encoding of any given input, and there is more
// than one good algorithm along the frontier of trading off throughput for
// output size. Nonetheless, we presume that the C++ encoder's algorithm is a
// good one and has been tested on a wide range of inputs, so matching that
// exactly should mean that the Go encoder's algorithm is also good, without
// needing to gather our own corpus of test data.
//
// The exact algorithm used, though, is endianness-dependent, as it puns a
// byte-pointer to a uint32-pointer to load and compare 4 bytes at a time. For
// example, the "testdata/pi.txt.rawsnappy" file was generated by C++ code on a
// little-endian system. The runtime package doesn't export endianness per se,
// but we can restrict this match-C++ test to common little-endian systems.
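// For example, for input bytes {0x0a, 0x0b, 0x0c, 0x0d}, Go's load32 always
// computes 0x0d0c0b0a, which is also what a little-endian pointer pun reads;
// a big-endian pun would read 0x0a0b0c0d instead, so the two encoders could
// pick different matches and produce different (but still valid) output.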
const goEncoderShouldMatchCppEncoder = runtime.GOARCH == "386" || runtime.GOARCH == "amd64"
func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
got := maxEncodedLenOfMaxBlockSize
want := MaxEncodedLen(maxBlockSize)
@@ -450,6 +468,57 @@ func TestDecodeGoldenInput(t *testing.T) {
}
}
func TestEncodeGoldenInput(t *testing.T) {
if !goEncoderShouldMatchCppEncoder {
t.Skipf("skipping testing that the encoding is byte-for-byte identical to C++: GOARCH=%s", runtime.GOARCH)
}
src, err := ioutil.ReadFile("testdata/pi.txt")
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
got := Encode(nil, src)
want, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
if err := cmp(got, want); err != nil {
t.Fatal(err)
}
}
func TestSameEncodingAsCpp(t *testing.T) {
if !goEncoderShouldMatchCppEncoder {
t.Skipf("skipping testing that the encoding is byte-for-byte identical to C++: GOARCH=%s", runtime.GOARCH)
}
const cmdName = "cmd/snappytool/snappytool"
_, err := os.Stat(cmdName)
if err != nil {
t.Skipf("could not find snappytool: %v", err)
}
for i, tf := range testFiles {
if err := downloadBenchmarkFiles(t, tf.filename); err != nil {
t.Fatalf("failed to download testdata: %s", err)
}
data := readFile(t, filepath.Join(benchDir, tf.filename))
if n := tf.sizeLimit; 0 < n && n < len(data) {
data = data[:n]
}
got := Encode(nil, data)
cmd := exec.Command(cmdName, "-e")
cmd.Stdin = bytes.NewReader(data)
want, err := cmd.Output()
if err != nil {
t.Fatalf("could not run snappytool: %v", err)
}
if err := cmp(got, want); err != nil {
t.Errorf("i=%d: %v", i, err)
}
}
}
// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
// described in decode_amd64.s and its claim of a 10 byte overrun worst case.
func TestSlowForwardCopyOverrun(t *testing.T) {
@@ -822,10 +891,17 @@ func benchEncode(b *testing.B, src []byte) {
}
}
func testOrBenchmark(b testing.TB) string {
if _, ok := b.(*testing.B); ok {
return "benchmark"
}
return "test"
}
func readFile(b testing.TB, filename string) []byte {
src, err := ioutil.ReadFile(filename)
if err != nil {
b.Skipf("skipping benchmark: %v", err)
b.Skipf("skipping %s: %v", testOrBenchmark(b), err)
}
if len(src) == 0 {
b.Fatalf("%s has zero length", filename)
@@ -906,14 +982,14 @@ const (
benchDir = "testdata/bench"
)
func downloadBenchmarkFiles(b *testing.B, basename string) (errRet error) {
func downloadBenchmarkFiles(b testing.TB, basename string) (errRet error) {
filename := filepath.Join(benchDir, basename)
if stat, err := os.Stat(filename); err == nil && stat.Size() != 0 {
return nil
}
if !*download {
b.Skipf("test data not found; skipping benchmark without the -download flag")
b.Skipf("test data not found; skipping %s without the -download flag", testOrBenchmark(b))
}
// Download the official snappy C++ implementation reference test data
// files for benchmarking.
@@ -947,12 +1023,12 @@ func downloadBenchmarkFiles(b *testing.B, basename string) (errRet error) {
return nil
}
func benchFile(b *testing.B, n int, decode bool) {
if err := downloadBenchmarkFiles(b, testFiles[n].filename); err != nil {
func benchFile(b *testing.B, i int, decode bool) {
if err := downloadBenchmarkFiles(b, testFiles[i].filename); err != nil {
b.Fatalf("failed to download testdata: %s", err)
}
data := readFile(b, filepath.Join(benchDir, testFiles[n].filename))
if n := testFiles[n].sizeLimit; 0 < n && n < len(data) {
data := readFile(b, filepath.Join(benchDir, testFiles[i].filename))
if n := testFiles[i].sizeLimit; 0 < n && n < len(data) {
data = data[:n]
}
if decode {