Use the same encoding algorithm as C++ snappy.

When encoding the benchmark files, the output is smaller for most inputs; the incompressible jpg, jpg_200 and pdf inputs are essentially unchanged:

len(in)  old_len(out)  new_len(out)  new/old_ratio  description
 102400         23488         22842           0.97  html
 702087        346345        335387           0.97  urls
 123093        123034        123034           1.00  jpg
    200           144           146           1.01  jpg_200
 102400         83786         83817           1.00  pdf
 409600         95095         92221           0.97  html4
 152089         91386         88017           0.96  txt1
 125179         80526         77525           0.96  txt2
 426754        244658        234392           0.96  txt3
 481861        331356        319097           0.96  txt4
 118588         24789         23295           0.94  pb
 184320         74129         69526           0.94  gaviota

On GOARCH=amd64, encoding throughput is also higher:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     674.93       681.22       1.01x
BenchmarkWordsEncode1e2-8     47.92        49.91        1.04x
BenchmarkWordsEncode1e3-8     189.48       213.64       1.13x
BenchmarkWordsEncode1e4-8     193.17       245.31       1.27x
BenchmarkWordsEncode1e5-8     151.44       178.84       1.18x
BenchmarkWordsEncode1e6-8     180.63       203.74       1.13x
BenchmarkRandomEncode-8       4700.25      5711.91      1.22x
Benchmark_ZFlat0-8            372.12       422.42       1.14x
Benchmark_ZFlat1-8            187.62       270.16       1.44x
Benchmark_ZFlat2-8            4891.26      5542.08      1.13x
Benchmark_ZFlat3-8            86.16        92.53        1.07x
Benchmark_ZFlat4-8            570.31       963.51       1.69x
Benchmark_ZFlat5-8            366.84       418.91       1.14x
Benchmark_ZFlat6-8            164.18       182.67       1.11x
Benchmark_ZFlat7-8            155.23       175.64       1.13x
Benchmark_ZFlat8-8            169.62       193.08       1.14x
Benchmark_ZFlat9-8            149.43       168.62       1.13x
Benchmark_ZFlat10-8           412.63       497.87       1.21x
Benchmark_ZFlat11-8           247.98       269.43       1.09x
Author: Nigel Tao, 2016-04-03 11:18:01 +10:00
Parent: ebebc71721
Commit: 8939696c22
2 changed files with 172 additions and 49 deletions

encode.go (131 changed lines)

@@ -14,11 +14,17 @@ import (
// code.
const maxOffset = 1 << 15
func load32(b []byte, i int32) uint32 {
func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func load64(b []byte, i int) uint64 {
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// emitLiteral writes a literal chunk and returns the number of bytes written.
func emitLiteral(dst, lit []byte) int {
i, n := 0, uint(len(lit)-1)
@@ -58,7 +64,7 @@ func emitLiteral(dst, lit []byte) int {
}
// emitCopy writes a copy chunk and returns the number of bytes written.
func emitCopy(dst []byte, offset, length int32) int {
func emitCopy(dst []byte, offset, length int) int {
i := 0
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
@@ -138,8 +144,6 @@ func Encode(dst, src []byte) []byte {
// that we don't overrun the dst and src buffers.
//
// TODO: implement this fast path.
//
// TODO: actually use inputMargin inside encodeBlock.
const inputMargin = 16 - 1
// minBlockSize is the minimum size of the input to encodeBlock. As above, we
@@ -149,6 +153,10 @@ const inputMargin = 16 - 1
// TODO: can we make this bound a little tighter, raising it by 1 or 2?
const minBlockSize = inputMargin
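// hash maps the 4 bytes packed into u to a table index. shift is 32 minus the
// table's log-size (it starts at 32-8 and drops by one each time tableSize
// doubles), so the result always lies in [0, tableSize).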
func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
@@ -159,19 +167,27 @@ const minBlockSize = inputMargin
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
const maxTableSize = 1 << 14
shift, tableSize := uint(32-8), 1<<8
shift, tableSize := uint32(32-8), 1<<8
for tableSize < maxTableSize && tableSize < len(src) {
shift--
tableSize *= 2
}
var table [maxTableSize]int32
// Iterate over the source bytes.
var (
s int32 // The iterator position.
t int32 // The last position with the same hash as s.
lit int32 // The start position of any pending literal bytes.
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := hash(load32(src, s), shift)
for {
// Copied from the C++ snappy implementation:
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
@@ -186,43 +202,74 @@ func encodeBlock(dst, src []byte) (d int) {
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
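// For example, with skip starting at 32, the first 32 probes advance s by
// one byte each (skip>>5 is 1), the next 32 probes by two bytes each, and
// so on, until a match resets skip back to 32.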
skip uint32 = 32
)
for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
// Update the hash table.
h := load32(src, s)
p := &table[(h*0x1e35a7bd)>>shift]
// We need to store values in [-1, inf) in table. To save
// some initialization time, (re)use the table's zero value
// and shift the values against this zero: add 1 on writes,
// subtract 1 on reads.
t, *p = *p-1, s+1
// If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte.
if t < 0 || s-t >= maxOffset || h != load32(src, t) {
s += int32(skip >> 5)
skip := 32
nextS := s
candidate := 0
for {
s = nextS
nextS = s + skip>>5
skip++
continue
if nextS > sLimit {
goto emitRemainder
}
candidate = int(table[nextHash])
table[nextHash] = int32(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
}
}
skip = 32
// Otherwise, we have a match. First, emit any pending literal bytes.
if lit != s {
d += emitLiteral(dst[d:], src[lit:s])
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
s += 4
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
}
d += emitCopy(dst[d:], base-candidate, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// We could immediately start working at s now, but to improve
// compression we first update the hash table at s-1 and at s. If
// another emitCopy is not our next move, also calculate nextHash
// at s+1. At least on GOARCH=amd64, these three hash calculations
// are faster as one load64 call (with some shifts) instead of
// three load32 calls.
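// x holds the 8 bytes src[s-1 .. s+6] in little-endian order, so uint32(x>>0),
// uint32(x>>8) and uint32(x>>16) are the 4-byte windows starting at s-1, s
// and s+1 respectively.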
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
table[prevHash] = int32(s - 1)
currHash := hash(uint32(x>>8), shift)
candidate = int(table[currHash])
table[currHash] = int32(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++
break
}
}
// Extend the match to be as long as possible.
s0 := s
s, t = s+4, t+4
for int(s) < len(src) && src[s] == src[t] {
s++
t++
}
// Emit the copied bytes.
d += emitCopy(dst[d:], s-t, s-s0)
lit = s
}
// Emit any final pending literal bytes and return.
if int(lit) != len(src) {
d += emitLiteral(dst[d:], src[lit:])
emitRemainder:
if nextEmit < len(src) {
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
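The package's exported API is unchanged by this commit; only the encoded bytes (and their size) can differ from previous releases. A minimal round trip through the encoder, sketched here assuming the usual github.com/golang/snappy import path:

	package main

	import (
		"bytes"
		"fmt"

		"github.com/golang/snappy"
	)

	func main() {
		src := []byte("hello, hello, hello, gopher")
		enc := snappy.Encode(nil, src) // the encoded bytes may differ from older releases
		dec, err := snappy.Decode(nil, enc)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%d -> %d bytes, round trip ok: %v\n", len(src), len(enc), bytes.Equal(dec, src))
	}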

snappy_test.go

@@ -14,13 +14,31 @@ import (
"math/rand"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"testing"
)
var download = flag.Bool("download", false, "If true, download any missing files before running benchmarks")
// goEncoderShouldMatchCppEncoder is whether to test that the algorithm used by
// Go's encoder matches byte-for-byte what the C++ snappy encoder produces.
// There is more than one valid encoding of any given input, and there is more
// than one good algorithm along the frontier of trading off throughput for
// output size. Nonetheless, we presume that the C++ encoder's algorithm is a
// good one and has been tested on a wide range of inputs, so matching that
// exactly should mean that the Go encoder's algorithm is also good, without
// needing to gather our own corpus of test data.
//
// The exact algorithm used, though, is endianness-dependent, as it puns a
// byte-pointer to a uint32-pointer to load and compare 4 bytes at a time. For
// example, the "testdata/pi.txt.rawsnappy" file was generated by C++ code on a
// little-endian system. The runtime package doesn't export endianness per se,
// but we can restrict this match-C++ test to common little-endian systems.
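// For example, for input bytes {0x0a, 0x0b, 0x0c, 0x0d}, Go's load32 always
// computes 0x0d0c0b0a, which is also what a little-endian pointer pun reads;
// a big-endian pun would read 0x0a0b0c0d instead, so the two encoders could
// pick different matches and produce different (but still valid) output.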
const goEncoderShouldMatchCppEncoder = runtime.GOARCH == "386" || runtime.GOARCH == "amd64"
func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
got := maxEncodedLenOfMaxBlockSize
want := MaxEncodedLen(maxBlockSize)
@@ -450,6 +468,57 @@ func TestDecodeGoldenInput(t *testing.T) {
}
}
func TestEncodeGoldenInput(t *testing.T) {
if !goEncoderShouldMatchCppEncoder {
t.Skipf("skipping testing that the encoding is byte-for-byte identical to C++: GOARCH=%s", runtime.GOARCH)
}
src, err := ioutil.ReadFile("testdata/pi.txt")
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
got := Encode(nil, src)
want, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
if err := cmp(got, want); err != nil {
t.Fatal(err)
}
}
func TestSameEncodingAsCpp(t *testing.T) {
if !goEncoderShouldMatchCppEncoder {
t.Skipf("skipping testing that the encoding is byte-for-byte identical to C++: GOARCH=%s", runtime.GOARCH)
}
const cmdName = "cmd/snappytool/snappytool"
_, err := os.Stat(cmdName)
if err != nil {
t.Skipf("could not find snappytool: %v", err)
}
for i, tf := range testFiles {
if err := downloadBenchmarkFiles(t, tf.filename); err != nil {
t.Fatalf("failed to download testdata: %s", err)
}
data := readFile(t, filepath.Join(benchDir, tf.filename))
if n := tf.sizeLimit; 0 < n && n < len(data) {
data = data[:n]
}
got := Encode(nil, data)
cmd := exec.Command(cmdName, "-e")
cmd.Stdin = bytes.NewReader(data)
want, err := cmd.Output()
if err != nil {
t.Fatalf("could not run snappytool: %v", err)
}
if err := cmp(got, want); err != nil {
t.Errorf("i=%d: %v", i, err)
}
}
}
// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
// described in decode_amd64.s and its claim of a 10 byte overrun worst case.
func TestSlowForwardCopyOverrun(t *testing.T) {
@@ -822,10 +891,17 @@ func benchEncode(b *testing.B, src []byte) {
}
}
func testOrBenchmark(b testing.TB) string {
if _, ok := b.(*testing.B); ok {
return "benchmark"
}
return "test"
}
func readFile(b testing.TB, filename string) []byte {
src, err := ioutil.ReadFile(filename)
if err != nil {
b.Skipf("skipping benchmark: %v", err)
b.Skipf("skipping %s: %v", testOrBenchmark(b), err)
}
if len(src) == 0 {
b.Fatalf("%s has zero length", filename)
@@ -906,14 +982,14 @@ const (
benchDir = "testdata/bench"
)
func downloadBenchmarkFiles(b *testing.B, basename string) (errRet error) {
func downloadBenchmarkFiles(b testing.TB, basename string) (errRet error) {
filename := filepath.Join(benchDir, basename)
if stat, err := os.Stat(filename); err == nil && stat.Size() != 0 {
return nil
}
if !*download {
b.Skipf("test data not found; skipping benchmark without the -download flag")
b.Skipf("test data not found; skipping %s without the -download flag", testOrBenchmark(b))
}
// Download the official snappy C++ implementation reference test data
// files for benchmarking.
@@ -947,12 +1023,12 @@ func downloadBenchmarkFiles(b *testing.B, basename string) (errRet error) {
return nil
}
func benchFile(b *testing.B, n int, decode bool) {
if err := downloadBenchmarkFiles(b, testFiles[n].filename); err != nil {
func benchFile(b *testing.B, i int, decode bool) {
if err := downloadBenchmarkFiles(b, testFiles[i].filename); err != nil {
b.Fatalf("failed to download testdata: %s", err)
}
data := readFile(b, filepath.Join(benchDir, testFiles[n].filename))
if n := testFiles[n].sizeLimit; 0 < n && n < len(data) {
data := readFile(b, filepath.Join(benchDir, testFiles[i].filename))
if n := testFiles[i].sizeLimit; 0 < n && n < len(data) {
data = data[:n]
}
if decode {