Mirror of https://github.com/golang/snappy.git
Make heuristic match skipping more aggressive.
This is the Go equivalent of an algorithmic change in the C++ snappy
code:
d53de18799
The discussion is at:
https://groups.google.com/d/topic/snappy-compression/3Qa3fASLkNA/discussion
benchmark                    old MB/s    new MB/s    speedup
BenchmarkWordsEncode1e1-8      680.57      679.12      1.00x
BenchmarkWordsEncode1e2-8       49.90       49.65      0.99x
BenchmarkWordsEncode1e3-8      213.28      212.75      1.00x
BenchmarkWordsEncode1e4-8      247.05      245.76      0.99x
BenchmarkWordsEncode1e5-8      180.68      179.95      1.00x
BenchmarkWordsEncode1e6-8      205.65      204.83      1.00x
BenchmarkRandomEncode-8       5678.83    11217.33      1.98x
Benchmark_ZFlat0-8             422.83      423.18      1.00x
Benchmark_ZFlat1-8             269.60      271.01      1.01x
Benchmark_ZFlat2-8            5517.16    11517.40      2.09x
Benchmark_ZFlat3-8              92.47       92.39      1.00x
Benchmark_ZFlat4-8             954.63     2947.73      3.09x
Benchmark_ZFlat5-8             419.71      419.87      1.00x
Benchmark_ZFlat6-8             184.13      183.45      1.00x
Benchmark_ZFlat7-8             175.83      175.89      1.00x
Benchmark_ZFlat8-8             193.49      193.84      1.00x
Benchmark_ZFlat9-8             169.02      168.59      1.00x
Benchmark_ZFlat10-8            500.19      499.85      1.00x
Benchmark_ZFlat11-8            271.20      270.60      1.00x
Parent: cef980a12b
Commit: 9bc0b5ad10
@@ -1,6 +1,9 @@
 /*
 To build the snappytool binary:
 g++ main.cpp /usr/lib/libsnappy.a -o snappytool
+or, if you have built the C++ snappy library from source:
+g++ main.cpp /path/to/your/snappy/.libs/libsnappy.a -o snappytool
+after running "make" from your snappy checkout directory.
 */
 
 #include <errno.h>
encode.go (18 changed lines)
@@ -204,12 +204,13 @@ func encodeBlock(dst, src []byte) (d int) {
 		//
 		// Heuristic match skipping: If 32 bytes are scanned with no matches
 		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned, look at every third byte, etc.. When a match is found,
-		// immediately go back to looking at every byte. This is a small loss
-		// (~5% performance, ~0.1% density) for compressible data due to more
-		// bookkeeping, but for non-compressible data (such as JPEG) it's a
-		// huge win since the compressor quickly "realizes" the data is
-		// incompressible and doesn't bother looking for matches everywhere.
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
 		//
 		// The "skip" variable keeps track of how many bytes there are since
 		// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
@@ -220,8 +221,9 @@ func encodeBlock(dst, src []byte) (d int) {
 		candidate := 0
 		for {
 			s = nextS
-			nextS = s + skip>>5
-			skip++
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
 			if nextS > sLimit {
 				goto emitRemainder
 			}
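For intuition about what the new update rule buys on incompressible input, here is a small standalone sketch (not part of the package; the helper name countProbes is made up for the example). It replays both schedules over a block in which no match is ever found and counts the hash lookups each one performs, assuming skip starts at 32 as it does in encodeBlock.

package main

import "fmt"

// countProbes replays a skipping schedule over n bytes of input that never
// produces a match and returns how many hash lookups were performed.
func countProbes(n int, step func(s, skip int) (nextS, nextSkip int)) int {
	s, skip, probes := 0, 32, 0
	for s < n {
		probes++
		s, skip = step(s, skip)
	}
	return probes
}

func main() {
	const n = 1 << 20 // one incompressible MiB

	// Old rule: the stride (skip>>5) grows by one byte only every 32 probes.
	old := countProbes(n, func(s, skip int) (int, int) {
		return s + skip>>5, skip + 1
	})

	// New rule: skip grows by the current stride itself, so the stride
	// compounds by roughly a 33/32 factor per probe.
	agg := countProbes(n, func(s, skip int) (int, int) {
		bytesBetweenHashLookups := skip >> 5
		return s + bytesBetweenHashLookups, skip + bytesBetweenHashLookups
	})

	fmt.Printf("old schedule: %d probes, new schedule: %d probes\n", old, agg)
}

That geometric growth of the stride is why the purely incompressible benchmarks above (BenchmarkRandomEncode, Benchmark_ZFlat2) roughly double, while compressible inputs, which reset skip to 32 each time a match is emitted, are essentially unchanged.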
@@ -459,6 +459,12 @@ func TestDecodeLengthOffset(t *testing.T) {
 	}
 }
 
+// TODO: pi.txt no longer compresses at all, after the Go code implemented the
+// same algorithmic change as the C++ code:
+// https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc
+//
+// We should use a more compressible test file.
+
 func TestDecodeGoldenInput(t *testing.T) {
 	src, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
 	if err != nil {
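To make the TODO concrete, here is a rough standalone sketch of what the golden-input check amounts to, using the package's exported API (snappy.Decode and snappy.Encode). The testdata/pi.txt path for the uncompressed original is an assumption made for the example.

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"

	"github.com/golang/snappy"
)

func main() {
	// The pre-encoded golden input checked into the repository.
	enc, err := ioutil.ReadFile("testdata/pi.txt.rawsnappy")
	if err != nil {
		log.Fatal(err)
	}
	got, err := snappy.Decode(nil, enc)
	if err != nil {
		log.Fatal(err)
	}

	// Assumed location of the uncompressed original; compare the round trip.
	want, err := ioutil.ReadFile("testdata/pi.txt")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("decode matches golden input:", bytes.Equal(got, want))

	// Per the TODO, re-encoding the digits of pi under the more aggressive
	// skipping yields essentially no compression, so this golden file now
	// mainly exercises the decoder rather than the encoder.
	fmt.Printf("re-encoded: %d of %d bytes\n", len(snappy.Encode(nil, want)), len(want))
}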
@@ -532,6 +538,7 @@ func TestSameEncodingAsCppLongFiles(t *testing.T) {
 	if msg := skipTestSameEncodingAsCpp(); msg != "" {
 		t.Skip(msg)
 	}
+	failed := false
 	for i, tf := range testFiles {
 		if err := downloadBenchmarkFiles(t, tf.filename); err != nil {
 			t.Fatalf("failed to download testdata: %s", err)
@@ -542,8 +549,14 @@ func TestSameEncodingAsCppLongFiles(t *testing.T) {
 		}
 		if err := runTestSameEncodingAsCpp(data); err != nil {
 			t.Errorf("i=%d: %v", i, err)
+			failed = true
 		}
 	}
+	if failed {
+		t.Errorf("was the snappytool program built against the C++ snappy library version " +
+			"d53de187 or later, commited on 2016-04-05? See " +
+			"https://github.com/google/snappy/commit/d53de18799418e113e44444252a39b12a0e4e0cc")
+	}
 }
 
 // TestSlowForwardCopyOverrun tests the "expand the pattern" algorithm
@@ -595,7 +608,7 @@ func TestSlowForwardCopyOverrun(t *testing.T) {
 // incompressible and the second half is very compressible. The encoded form's
 // length should be closer to 50% of the original length than 100%.
 func TestEncodeNoiseThenRepeats(t *testing.T) {
-	for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+	for _, origLen := range []int{256 * 1024, 2048 * 1024} {
 		src := make([]byte, origLen)
 		rng := rand.New(rand.NewSource(1))
 		firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
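A standalone sketch of the same shape of check, using the package's exported Encode; the particular fill pattern for the compressible half is just one plausible choice, not necessarily the one the test uses.

package main

import (
	"fmt"
	"math/rand"

	"github.com/golang/snappy"
)

func main() {
	const origLen = 256 * 1024
	src := make([]byte, origLen)
	rng := rand.New(rand.NewSource(1))

	// First half: incompressible noise. Second half: long runs of repeated
	// bytes, which snappy compresses very well.
	firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
	for i := range firstHalf {
		firstHalf[i] = uint8(rng.Intn(256))
	}
	for i := range secondHalf {
		secondHalf[i] = uint8(i >> 8)
	}

	enc := snappy.Encode(nil, src)
	ratio := float64(len(enc)) / float64(origLen)

	// The test expects the encoded length to be closer to 50% of the input
	// than to 100%, i.e. a ratio below 0.75, even with aggressive skipping
	// through the noisy first half.
	fmt.Printf("encoded %d of %d bytes (ratio %.2f)\n", len(enc), origLen, ratio)
}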
Binary file not shown.