Make heuristic match skipping more aggressive.

This is the Go equivalent of an algorithmic change in the C++ snappy
code:
d53de18799

The discussion is at:
https://groups.google.com/d/topic/snappy-compression/3Qa3fASLkNA/discussion

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     680.57       679.12       1.00x
BenchmarkWordsEncode1e2-8     49.90        49.65        0.99x
BenchmarkWordsEncode1e3-8     213.28       212.75       1.00x
BenchmarkWordsEncode1e4-8     247.05       245.76       0.99x
BenchmarkWordsEncode1e5-8     180.68       179.95       1.00x
BenchmarkWordsEncode1e6-8     205.65       204.83       1.00x
BenchmarkRandomEncode-8       5678.83      11217.33     1.98x
Benchmark_ZFlat0-8            422.83       423.18       1.00x
Benchmark_ZFlat1-8            269.60       271.01       1.01x
Benchmark_ZFlat2-8            5517.16      11517.40     2.09x
Benchmark_ZFlat3-8            92.47        92.39        1.00x
Benchmark_ZFlat4-8            954.63       2947.73      3.09x
Benchmark_ZFlat5-8            419.71       419.87       1.00x
Benchmark_ZFlat6-8            184.13       183.45       1.00x
Benchmark_ZFlat7-8            175.83       175.89       1.00x
Benchmark_ZFlat8-8            193.49       193.84       1.00x
Benchmark_ZFlat9-8            169.02       168.59       1.00x
Benchmark_ZFlat10-8           500.19       499.85       1.00x
Benchmark_ZFlat11-8           271.20       270.60       1.00x
This commit is contained in:
Nigel Tao 2016-04-10 16:02:28 +10:00
Родитель cef980a12b
Коммит 9bc0b5ad10
4 изменённых файлов: 27 добавлений и 9 удалений

Просмотреть файл

@ -1,6 +1,9 @@
/*
To build the snappytool binary:
g++ main.cpp /usr/lib/libsnappy.a -o snappytool
or, if you have built the C++ snappy library from source:
g++ main.cpp /path/to/your/snappy/.libs/libsnappy.a -o snappytool
after running "make" from your snappy checkout directory.
*/
#include <errno.h>

Просмотреть файл

@ -204,12 +204,13 @@ func encodeBlock(dst, src []byte) (d int) {
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
// scanned, look at every third byte, etc.. When a match is found,
// immediately go back to looking at every byte. This is a small loss
// (~5% performance, ~0.1% density) for compressible data due to more
// bookkeeping, but for non-compressible data (such as JPEG) it's a
// huge win since the compressor quickly "realizes" the data is
// incompressible and doesn't bother looking for matches everywhere.
// scanned (or skipped), look at every third byte, etc.. When a match
// is found, immediately go back to looking at every byte. This is a
// small loss (~5% performance, ~0.1% density) for compressible data
// due to more bookkeeping, but for non-compressible data (such as
// JPEG) it's a huge win since the compressor quickly "realizes" the
// data is incompressible and doesn't bother looking for matches
// everywhere.
//
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
@ -220,8 +221,9 @@ func encodeBlock(dst, src []byte) (d int) {
candidate := 0
for {
s = nextS
nextS = s + skip>>5
skip++
bytesBetweenHashLookups := skip >> 5
nextS = s + bytesBetweenHashLookups
skip += bytesBetweenHashLookups
if nextS > sLimit {
goto emitRemainder
}

Просмотреть файл

@ -459,6 +459,12 @@ func TestDecodeLengthOffset(t *testing.T) {
}
}
// TODO: pi.txt no longer compresses at all, after the Go code implemented the
// same algorithmic change as the C++ code:
// https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc
//
// We should use a more compressible test file.
func TestDecodeGoldenInput(t *testing.T) {
src, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
if err != nil {
@ -532,6 +538,7 @@ func TestSameEncodingAsCppLongFiles(t *testing.T) {
if msg := skipTestSameEncodingAsCpp(); msg != "" {
t.Skip(msg)
}
failed := false
for i, tf := range testFiles {
if err := downloadBenchmarkFiles(t, tf.filename); err != nil {
t.Fatalf("failed to download testdata: %s", err)
@ -542,8 +549,14 @@ func TestSameEncodingAsCppLongFiles(t *testing.T) {
}
if err := runTestSameEncodingAsCpp(data); err != nil {
t.Errorf("i=%d: %v", i, err)
failed = true
}
}
if failed {
t.Errorf("was the snappytool program built against the C++ snappy library version " +
"d53de187 or later, commited on 2016-04-05? See " +
"https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc")
}
}
// TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
@ -595,7 +608,7 @@ func TestSlowForwardCopyOverrun(t *testing.T) {
// incompressible and the second half is very compressible. The encoded form's
// length should be closer to 50% of the original length than 100%.
func TestEncodeNoiseThenRepeats(t *testing.T) {
for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
for _, origLen := range []int{256 * 1024, 2048 * 1024} {
src := make([]byte, origLen)
rng := rand.New(rand.NewSource(1))
firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]

Двоичные данные
testdata/pi.txt.rawsnappy поставляемый

Двоичный файл не отображается.