2011-08-31 15:59:08 +04:00
|
|
|
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package snappy
|
|
|
|
|
|
|
|
import (
|
2011-11-08 03:33:50 +04:00
|
|
|
"encoding/binary"
|
2016-02-11 07:44:51 +03:00
|
|
|
"errors"
|
2015-02-10 03:56:34 +03:00
|
|
|
"io"
|
2011-08-31 15:59:08 +04:00
|
|
|
)
|
|
|
|
|
2016-02-25 07:30:12 +03:00
|
|
|
// maxOffset limits how far copy back-references can go, the same as the C++
// code.
const maxOffset = 1 << 15
|
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
// load32 returns the little-endian uint32 stored in b[i:i+4].
//
// It panics if b does not hold four bytes starting at index i.
func load32(b []byte, i int) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
|
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
// load64 returns the little-endian uint64 stored in b[i:i+8].
//
// It panics if b does not hold eight bytes starting at index i.
func load64(b []byte, i int) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
|
|
|
|
|
2011-08-31 15:59:08 +04:00
|
|
|
// emitLiteral writes a literal chunk and returns the number of bytes written.
|
|
|
|
func emitLiteral(dst, lit []byte) int {
|
|
|
|
i, n := 0, uint(len(lit)-1)
|
|
|
|
switch {
|
|
|
|
case n < 60:
|
|
|
|
dst[0] = uint8(n)<<2 | tagLiteral
|
|
|
|
i = 1
|
|
|
|
case n < 1<<8:
|
|
|
|
dst[0] = 60<<2 | tagLiteral
|
|
|
|
dst[1] = uint8(n)
|
|
|
|
i = 2
|
|
|
|
case n < 1<<16:
|
|
|
|
dst[0] = 61<<2 | tagLiteral
|
|
|
|
dst[1] = uint8(n)
|
|
|
|
dst[2] = uint8(n >> 8)
|
|
|
|
i = 3
|
|
|
|
case n < 1<<24:
|
|
|
|
dst[0] = 62<<2 | tagLiteral
|
|
|
|
dst[1] = uint8(n)
|
|
|
|
dst[2] = uint8(n >> 8)
|
|
|
|
dst[3] = uint8(n >> 16)
|
|
|
|
i = 4
|
|
|
|
case int64(n) < 1<<32:
|
|
|
|
dst[0] = 63<<2 | tagLiteral
|
|
|
|
dst[1] = uint8(n)
|
|
|
|
dst[2] = uint8(n >> 8)
|
|
|
|
dst[3] = uint8(n >> 16)
|
|
|
|
dst[4] = uint8(n >> 24)
|
|
|
|
i = 5
|
|
|
|
default:
|
|
|
|
panic("snappy: source buffer is too long")
|
|
|
|
}
|
|
|
|
if copy(dst[i:], lit) != len(lit) {
|
|
|
|
panic("snappy: destination buffer is too short")
|
|
|
|
}
|
|
|
|
return i + len(lit)
|
|
|
|
}
|
|
|
|
|
|
|
|
// emitCopy writes a copy chunk and returns the number of bytes written.
|
2016-04-03 04:18:01 +03:00
|
|
|
func emitCopy(dst []byte, offset, length int) int {
|
2011-08-31 15:59:08 +04:00
|
|
|
i := 0
|
2016-04-02 08:22:40 +03:00
|
|
|
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
|
|
|
|
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
|
|
|
|
// length emitted down below is is a little lower (at 60 = 64 - 4), because
|
|
|
|
// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
|
|
|
|
// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
|
|
|
|
// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
|
|
|
|
// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
|
|
|
|
// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
|
|
|
|
// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
|
|
|
|
for length >= 68 {
|
|
|
|
// Emit a length 64 copy, encoded as 3 bytes.
|
|
|
|
dst[i+0] = 63<<2 | tagCopy2
|
|
|
|
dst[i+1] = uint8(offset)
|
|
|
|
dst[i+2] = uint8(offset >> 8)
|
|
|
|
i += 3
|
|
|
|
length -= 64
|
|
|
|
}
|
|
|
|
if length > 64 {
|
|
|
|
// Emit a length 60 copy, encoded as 3 bytes.
|
|
|
|
dst[i+0] = 59<<2 | tagCopy2
|
2011-08-31 15:59:08 +04:00
|
|
|
dst[i+1] = uint8(offset)
|
|
|
|
dst[i+2] = uint8(offset >> 8)
|
|
|
|
i += 3
|
2016-04-02 08:22:40 +03:00
|
|
|
length -= 60
|
|
|
|
}
|
|
|
|
if length >= 12 || offset >= 2048 {
|
|
|
|
// Emit the remaining copy, encoded as 3 bytes.
|
|
|
|
dst[i+0] = uint8(length-1)<<2 | tagCopy2
|
|
|
|
dst[i+1] = uint8(offset)
|
|
|
|
dst[i+2] = uint8(offset >> 8)
|
|
|
|
return i + 3
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
2016-04-02 08:22:40 +03:00
|
|
|
// Emit the remaining copy, encoded as 2 bytes.
|
|
|
|
dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
|
|
|
|
dst[i+1] = uint8(offset)
|
|
|
|
return i + 2
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Encode returns the encoded form of src. The returned slice may be a sub-
|
|
|
|
// slice of dst if dst was large enough to hold the entire encoded block.
|
|
|
|
// Otherwise, a newly allocated slice will be returned.
|
2016-02-13 12:09:53 +03:00
|
|
|
//
|
2011-08-31 15:59:08 +04:00
|
|
|
// It is valid to pass a nil dst.
|
2015-07-17 10:21:07 +03:00
|
|
|
func Encode(dst, src []byte) []byte {
|
2016-02-13 12:09:53 +03:00
|
|
|
if n := MaxEncodedLen(len(src)); n < 0 {
|
|
|
|
panic(ErrTooLarge)
|
|
|
|
} else if len(dst) < n {
|
2011-08-31 15:59:08 +04:00
|
|
|
dst = make([]byte, n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// The block starts with the varint-encoded length of the decompressed bytes.
|
2011-11-08 03:33:50 +04:00
|
|
|
d := binary.PutUvarint(dst, uint64(len(src)))
|
2011-08-31 15:59:08 +04:00
|
|
|
|
2016-02-13 06:11:38 +03:00
|
|
|
for len(src) > 0 {
|
|
|
|
p := src
|
|
|
|
src = nil
|
2016-02-22 04:44:36 +03:00
|
|
|
if len(p) > maxBlockSize {
|
|
|
|
p, src = p[:maxBlockSize], p[maxBlockSize:]
|
2016-02-13 06:11:38 +03:00
|
|
|
}
|
2016-04-03 02:25:18 +03:00
|
|
|
if len(p) < minBlockSize {
|
|
|
|
d += emitLiteral(dst[d:], p)
|
|
|
|
} else {
|
|
|
|
d += encodeBlock(dst[d:], p)
|
|
|
|
}
|
2016-02-13 06:11:38 +03:00
|
|
|
}
|
|
|
|
return dst[:d]
|
|
|
|
}
|
|
|
|
|
2016-04-03 02:25:18 +03:00
|
|
|
// inputMargin is the minimum number of extra input bytes to keep, inside
// encodeBlock's inner loop. On some architectures, this margin lets us
// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
// literals can be implemented as a single load to and store from a 16-byte
// register. That literal's actual length can be as short as 1 byte, so this
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
//
// TODO: implement this fast path.
const inputMargin = 16 - 1
|
|
|
|
|
|
|
|
// minBlockSize is the minimum size of the input to encodeBlock. As above, we
|
|
|
|
// want any emitLiteral calls inside encodeBlock's inner loop to use the fast
|
|
|
|
// path if possible, which requires being able to overrun by inputMargin bytes.
|
|
|
|
//
|
|
|
|
// TODO: can we make this bound a little tighter, raising it by 1 or 2?
|
|
|
|
const minBlockSize = inputMargin
|
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
// hash maps the 4-byte value u to a hash table index. It multiplies u by a
// fixed odd constant (wrapping at 32 bits) and keeps the top 32-shift bits of
// the product, so the result is always below 1<<(32-shift).
func hash(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}
|
|
|
|
|
2016-02-22 04:44:36 +03:00
|
|
|
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
|
|
|
|
// assumes that the varint-encoded length of the decompressed bytes has already
|
|
|
|
// been written.
|
2016-02-13 06:11:38 +03:00
|
|
|
//
|
|
|
|
// It also assumes that:
|
|
|
|
// len(dst) >= MaxEncodedLen(len(src)) &&
|
2016-04-03 02:25:18 +03:00
|
|
|
// minBlockSize <= len(src) && len(src) <= maxBlockSize
|
2016-02-22 04:44:36 +03:00
|
|
|
func encodeBlock(dst, src []byte) (d int) {
|
2011-08-31 15:59:08 +04:00
|
|
|
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
|
|
|
|
const maxTableSize = 1 << 14
|
2016-04-03 04:18:01 +03:00
|
|
|
shift, tableSize := uint32(32-8), 1<<8
|
2011-08-31 15:59:08 +04:00
|
|
|
for tableSize < maxTableSize && tableSize < len(src) {
|
|
|
|
shift--
|
|
|
|
tableSize *= 2
|
|
|
|
}
|
2016-02-13 06:11:38 +03:00
|
|
|
var table [maxTableSize]int32
|
2011-08-31 15:59:08 +04:00
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
|
|
|
// lets us use a fast path for emitLiteral in the main loop, while we are
|
|
|
|
// looking for copies.
|
|
|
|
sLimit := len(src) - inputMargin
|
|
|
|
|
|
|
|
// nextEmit is where in src the next emitLiteral should start from.
|
|
|
|
nextEmit := 0
|
|
|
|
|
|
|
|
// The encoded form must start with a literal, as there are no previous
|
|
|
|
// bytes to copy, so we start looking for hash matches at s == 1.
|
|
|
|
s := 1
|
|
|
|
nextHash := hash(load32(src, s), shift)
|
Fix heuristic match skipping.
The heuristic was introduced in 4e2aa98e, based on the C++ Snappy
implementation, but the Go code contained a flawed optimization. The C++ code
used an explicit skip variable:
uint32 bytes_between_hash_lookups = skip++ >> 5;
next_ip = ip + bytes_between_hash_lookups;
whereas the Go code optimized this to be an implicit skip:
s += 1 + (s-lit)>>5
This is equivalent for small s values (relative to lit, the last hash table
hit), but diverges for large ones. This Go program demonstrates the difference:
// main prints the encoder skipping behavior when seeing no hash table hits.
func main() {
s0, s1 := 0, 0
skip := 32
for i := 0; i < 300; i++ {
// This is the C++ Snappy algorithm.
bytes_between_hash_lookups := skip >> 5
skip++
s0 += bytes_between_hash_lookups
// This is the Go Snappy algorithm.
s1 += 1 + s1>>5
// The intention was that the Go algorithm behaves the same as the C++
// one, but it doesn't.
if i%10 == 0 {
fmt.Printf("%d\t%d\t%d\n", i, s0, s1)
}
}
}
It prints:
0 1 1
10 11 11
20 21 21
30 31 31
40 50 50
50 70 73
60 90 105
70 117 149
80 147 208
90 177 288
100 212 398
110 252 548
120 292 752
130 335 1030
140 385 1408
150 435 1922
160 486 2619
170 546 3568
180 606 4861
190 666 6617
200 735 9005
210 805 12257
220 875 16681
230 952 22697
240 1032 30881
250 1112 42015
260 1197 57161
270 1287 77764
280 1377 105791
290 1470 143914
The C++ algorithm is quadratic. The Go algorithm is exponential.
This commit re-introduces the explicit skip variable, so that the Go
implementation matches the C++ implementation.
For completeness, benchmark numbers are included below, but the worse numbers
merely reflect that the old Go algorithm was too aggressive on skipping ahead
on incompressible input (RandomEncode, ZFlat2 and ZFlat4), and so after an
initial warm-up period, it was essentially performing not much more than a
memcpy. Memcpy is indeed fast in terms of MB/s, but it doesn't compress at all,
which obviously defeats the whole purpose of a compression format like Snappy.
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-4 3.65 3.77 1.03x
BenchmarkWordsEncode1e2-4 29.22 29.35 1.00x
BenchmarkWordsEncode1e3-4 99.46 101.20 1.02x
BenchmarkWordsEncode1e4-4 118.11 121.54 1.03x
BenchmarkWordsEncode1e5-4 90.37 91.72 1.01x
BenchmarkWordsEncode1e6-4 107.49 108.88 1.01x
BenchmarkRandomEncode-4 7679.09 4491.97 0.58x
Benchmark_ZFlat0-4 229.41 233.79 1.02x
Benchmark_ZFlat1-4 115.10 116.83 1.02x
Benchmark_ZFlat2-4 7256.88 3003.79 0.41x
Benchmark_ZFlat3-4 53.39 54.02 1.01x
Benchmark_ZFlat4-4 1873.63 289.28 0.15x
Benchmark_ZFlat5-4 233.29 234.95 1.01x
Benchmark_ZFlat6-4 101.33 102.79 1.01x
Benchmark_ZFlat7-4 95.26 96.63 1.01x
Benchmark_ZFlat8-4 105.66 106.89 1.01x
Benchmark_ZFlat9-4 92.04 93.11 1.01x
Benchmark_ZFlat10-4 265.68 265.93 1.00x
Benchmark_ZFlat11-4 149.72 151.32 1.01x
These numbers were generated on an amd64 machine, but on a different machine
than the one used for other recent commits. The raw MB/s numbers are therefore
not directly comparable, although the speedup numbers should be.
2016-02-14 08:54:35 +03:00
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
for {
|
Fix heuristic match skipping.
The heuristic was introduced in 4e2aa98e, based on the C++ Snappy
implementation, but the Go code contained a flawed optimization. The C++ code
used an explicit skip variable:
uint32 bytes_between_hash_lookups = skip++ >> 5;
next_ip = ip + bytes_between_hash_lookups;
whereas the Go code optimized this to be an implicit skip:
s += 1 + (s-lit)>>5
This is equivalent for small s values (relative to lit, the last hash table
hit), but diverges for large ones. This Go program demonstrates the difference:
// main prints the encoder skipping behavior when seeing no hash table hits.
func main() {
s0, s1 := 0, 0
skip := 32
for i := 0; i < 300; i++ {
// This is the C++ Snappy algorithm.
bytes_between_hash_lookups := skip >> 5
skip++
s0 += bytes_between_hash_lookups
// This is the Go Snappy algorithm.
s1 += 1 + s1>>5
// The intention was that the Go algorithm behaves the same as the C++
// one, but it doesn't.
if i%10 == 0 {
fmt.Printf("%d\t%d\t%d\n", i, s0, s1)
}
}
}
It prints:
0 1 1
10 11 11
20 21 21
30 31 31
40 50 50
50 70 73
60 90 105
70 117 149
80 147 208
90 177 288
100 212 398
110 252 548
120 292 752
130 335 1030
140 385 1408
150 435 1922
160 486 2619
170 546 3568
180 606 4861
190 666 6617
200 735 9005
210 805 12257
220 875 16681
230 952 22697
240 1032 30881
250 1112 42015
260 1197 57161
270 1287 77764
280 1377 105791
290 1470 143914
The C++ algorithm is quadratic. The Go algorithm is exponential.
This commit re-introduces the explicit skip variable, so that the Go
implementation matches the C++ implementation.
For completeness, benchmark numbers are included below, but the worse numbers
merely reflect that the old Go algorithm was too aggressive on skipping ahead
on incompressible input (RandomEncode, ZFlat2 and ZFlat4), and so after an
initial warm-up period, it was essentially performing not much more than a
memcpy. Memcpy is indeed fast in terms of MB/s, but it doesn't compress at all,
which obviously defeats the whole purpose of a compression format like Snappy.
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-4 3.65 3.77 1.03x
BenchmarkWordsEncode1e2-4 29.22 29.35 1.00x
BenchmarkWordsEncode1e3-4 99.46 101.20 1.02x
BenchmarkWordsEncode1e4-4 118.11 121.54 1.03x
BenchmarkWordsEncode1e5-4 90.37 91.72 1.01x
BenchmarkWordsEncode1e6-4 107.49 108.88 1.01x
BenchmarkRandomEncode-4 7679.09 4491.97 0.58x
Benchmark_ZFlat0-4 229.41 233.79 1.02x
Benchmark_ZFlat1-4 115.10 116.83 1.02x
Benchmark_ZFlat2-4 7256.88 3003.79 0.41x
Benchmark_ZFlat3-4 53.39 54.02 1.01x
Benchmark_ZFlat4-4 1873.63 289.28 0.15x
Benchmark_ZFlat5-4 233.29 234.95 1.01x
Benchmark_ZFlat6-4 101.33 102.79 1.01x
Benchmark_ZFlat7-4 95.26 96.63 1.01x
Benchmark_ZFlat8-4 105.66 106.89 1.01x
Benchmark_ZFlat9-4 92.04 93.11 1.01x
Benchmark_ZFlat10-4 265.68 265.93 1.00x
Benchmark_ZFlat11-4 149.72 151.32 1.01x
These numbers were generated on an amd64 machine, but on a different machine
than the one used for other recent commits. The raw MB/s numbers are therefore
not directly comparable, although the speedup numbers should be.
2016-02-14 08:54:35 +03:00
|
|
|
// Copied from the C++ snappy implementation:
|
|
|
|
//
|
|
|
|
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
|
|
|
// found, start looking only at every other byte. If 32 more bytes are
|
|
|
|
// scanned, look at every third byte, etc.. When a match is found,
|
|
|
|
// immediately go back to looking at every byte. This is a small loss
|
|
|
|
// (~5% performance, ~0.1% density) for compressible data due to more
|
|
|
|
// bookkeeping, but for non-compressible data (such as JPEG) it's a
|
|
|
|
// huge win since the compressor quickly "realizes" the data is
|
|
|
|
// incompressible and doesn't bother looking for matches everywhere.
|
|
|
|
//
|
|
|
|
// The "skip" variable keeps track of how many bytes there are since
|
|
|
|
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
|
|
|
// the number of bytes to move ahead for each iteration.
|
2016-04-03 04:18:01 +03:00
|
|
|
skip := 32
|
|
|
|
|
|
|
|
nextS := s
|
|
|
|
candidate := 0
|
|
|
|
for {
|
|
|
|
s = nextS
|
|
|
|
nextS = s + skip>>5
|
Fix heuristic match skipping.
The heuristic was introduced in 4e2aa98e, based on the C++ Snappy
implementation, but the Go code contained a flawed optimization. The C++ code
used an explicit skip variable:
uint32 bytes_between_hash_lookups = skip++ >> 5;
next_ip = ip + bytes_between_hash_lookups;
whereas the Go code optimized this to be an implicit skip:
s += 1 + (s-lit)>>5
This is equivalent for small s values (relative to lit, the last hash table
hit), but diverges for large ones. This Go program demonstrates the difference:
// main prints the encoder skipping behavior when seeing no hash table hits.
func main() {
s0, s1 := 0, 0
skip := 32
for i := 0; i < 300; i++ {
// This is the C++ Snappy algorithm.
bytes_between_hash_lookups := skip >> 5
skip++
s0 += bytes_between_hash_lookups
// This is the Go Snappy algorithm.
s1 += 1 + s1>>5
// The intention was that the Go algorithm behaves the same as the C++
// one, but it doesn't.
if i%10 == 0 {
fmt.Printf("%d\t%d\t%d\n", i, s0, s1)
}
}
}
It prints:
0 1 1
10 11 11
20 21 21
30 31 31
40 50 50
50 70 73
60 90 105
70 117 149
80 147 208
90 177 288
100 212 398
110 252 548
120 292 752
130 335 1030
140 385 1408
150 435 1922
160 486 2619
170 546 3568
180 606 4861
190 666 6617
200 735 9005
210 805 12257
220 875 16681
230 952 22697
240 1032 30881
250 1112 42015
260 1197 57161
270 1287 77764
280 1377 105791
290 1470 143914
The C++ algorithm is quadratic. The Go algorithm is exponential.
This commit re-introduces the explicit skip variable, so that the Go
implementation matches the C++ implementation.
For completeness, benchmark numbers are included below, but the worse numbers
merely reflect that the old Go algorithm was too aggressive on skipping ahead
on incompressible input (RandomEncode, ZFlat2 and ZFlat4), and so after an
initial warm-up period, it was essentially performing not much more than a
memcpy. Memcpy is indeed fast in terms of MB/s, but it doesn't compress at all,
which obviously defeats the whole purpose of a compression format like Snappy.
benchmark old MB/s new MB/s speedup
BenchmarkWordsEncode1e1-4 3.65 3.77 1.03x
BenchmarkWordsEncode1e2-4 29.22 29.35 1.00x
BenchmarkWordsEncode1e3-4 99.46 101.20 1.02x
BenchmarkWordsEncode1e4-4 118.11 121.54 1.03x
BenchmarkWordsEncode1e5-4 90.37 91.72 1.01x
BenchmarkWordsEncode1e6-4 107.49 108.88 1.01x
BenchmarkRandomEncode-4 7679.09 4491.97 0.58x
Benchmark_ZFlat0-4 229.41 233.79 1.02x
Benchmark_ZFlat1-4 115.10 116.83 1.02x
Benchmark_ZFlat2-4 7256.88 3003.79 0.41x
Benchmark_ZFlat3-4 53.39 54.02 1.01x
Benchmark_ZFlat4-4 1873.63 289.28 0.15x
Benchmark_ZFlat5-4 233.29 234.95 1.01x
Benchmark_ZFlat6-4 101.33 102.79 1.01x
Benchmark_ZFlat7-4 95.26 96.63 1.01x
Benchmark_ZFlat8-4 105.66 106.89 1.01x
Benchmark_ZFlat9-4 92.04 93.11 1.01x
Benchmark_ZFlat10-4 265.68 265.93 1.00x
Benchmark_ZFlat11-4 149.72 151.32 1.01x
These numbers were generated on an amd64 machine, but on a different machine
than the one used for other recent commits. The raw MB/s numbers are therefore
not directly comparable, although the speedup numbers should be.
2016-02-14 08:54:35 +03:00
|
|
|
skip++
|
2016-04-03 04:18:01 +03:00
|
|
|
if nextS > sLimit {
|
|
|
|
goto emitRemainder
|
|
|
|
}
|
|
|
|
candidate = int(table[nextHash])
|
|
|
|
table[nextHash] = int32(s)
|
|
|
|
nextHash = hash(load32(src, nextS), shift)
|
|
|
|
if load32(src, s) == load32(src, candidate) {
|
|
|
|
break
|
|
|
|
}
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
2016-04-03 04:18:01 +03:00
|
|
|
|
|
|
|
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
|
|
|
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
|
|
|
// them as literal bytes.
|
|
|
|
d += emitLiteral(dst[d:], src[nextEmit:s])
|
|
|
|
|
|
|
|
// Call emitCopy, and then see if another emitCopy could be our next
|
|
|
|
// move. Repeat until we find no match for the input immediately after
|
|
|
|
// what was consumed by the last emitCopy call.
|
|
|
|
//
|
|
|
|
// If we exit this loop normally then we need to call emitLiteral next,
|
|
|
|
// though we don't yet know how big the literal will be. We handle that
|
|
|
|
// by proceeding to the next iteration of the main loop. We also can
|
|
|
|
// exit this loop via goto if we get close to exhausting the input.
|
|
|
|
for {
|
|
|
|
// Invariant: we have a 4-byte match at s, and no need to emit any
|
|
|
|
// literal bytes prior to s.
|
|
|
|
base := s
|
|
|
|
s += 4
|
|
|
|
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
|
|
|
|
}
|
|
|
|
d += emitCopy(dst[d:], base-candidate, s-base)
|
|
|
|
nextEmit = s
|
|
|
|
if s >= sLimit {
|
|
|
|
goto emitRemainder
|
|
|
|
}
|
|
|
|
|
|
|
|
// We could immediately start working at s now, but to improve
|
|
|
|
// compression we first update the hash table at s-1 and at s. If
|
|
|
|
// another emitCopy is not our next move, also calculate nextHash
|
|
|
|
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
|
|
|
// are faster as one load64 call (with some shifts) instead of
|
|
|
|
// three load32 calls.
|
|
|
|
x := load64(src, s-1)
|
|
|
|
prevHash := hash(uint32(x>>0), shift)
|
|
|
|
table[prevHash] = int32(s - 1)
|
|
|
|
currHash := hash(uint32(x>>8), shift)
|
|
|
|
candidate = int(table[currHash])
|
|
|
|
table[currHash] = int32(s)
|
|
|
|
if uint32(x>>8) != load32(src, candidate) {
|
|
|
|
nextHash = hash(uint32(x>>16), shift)
|
|
|
|
s++
|
|
|
|
break
|
|
|
|
}
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-03 04:18:01 +03:00
|
|
|
emitRemainder:
|
|
|
|
if nextEmit < len(src) {
|
|
|
|
d += emitLiteral(dst[d:], src[nextEmit:])
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
2016-02-13 06:11:38 +03:00
|
|
|
return d
|
2011-08-31 15:59:08 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// MaxEncodedLen returns the maximum length of a snappy block, given its
// uncompressed length.
//
// It will return a negative value if srcLen is too large to encode.
func MaxEncodedLen(srcLen int) int {
	// Converting to uint64 also turns a negative srcLen into a huge value,
	// which the check below rejects.
	n := uint64(srcLen)
	if n > 0xffffffff {
		return -1
	}
	// Compressed data can be defined as:
	//    compressed := item* literal*
	//    item       := literal* copy
	//
	// The trailing literal sequence has a space blowup of at most 62/60
	// since a literal of length 60 needs one tag byte + one extra byte
	// for length information.
	//
	// Item blowup is trickier to measure. Suppose the "copy" op copies
	// 4 bytes of data. Because of a special check in the encoding code,
	// we produce a 4-byte copy only if the offset is < 65536. Therefore
	// the copy op takes 3 bytes to encode, and this type of item leads
	// to at most the 62/60 blowup for representing literals.
	//
	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
	// enough, it will take 5 bytes to encode the copy op. Therefore the
	// worst case here is a one-byte literal followed by a five-byte copy.
	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
	//
	// This last factor dominates the blowup, so the final estimate is:
	n = 32 + n + n/6
	if n > 0xffffffff {
		return -1
	}
	return int(n)
}
|
2015-02-10 03:56:34 +03:00
|
|
|
|
2016-02-11 07:44:51 +03:00
|
|
|
// errClosed is stored as a Writer's error by Close when no earlier error was
// recorded, so that later writes to the closed Writer fail.
var errClosed = errors.New("snappy: Writer is closed")
|
|
|
|
|
|
|
|
// NewWriter returns a new Writer that compresses to w.
|
|
|
|
//
|
|
|
|
// The Writer returned does not buffer writes. There is no need to Flush or
|
|
|
|
// Close such a Writer.
|
|
|
|
//
|
|
|
|
// Deprecated: the Writer returned is not suitable for many small writes, only
|
|
|
|
// for few large writes. Use NewBufferedWriter instead, which is efficient
|
|
|
|
// regardless of the frequency and shape of the writes, and remember to Close
|
|
|
|
// that Writer when done.
|
2015-02-10 06:07:10 +03:00
|
|
|
func NewWriter(w io.Writer) *Writer {
|
|
|
|
return &Writer{
|
2016-02-11 07:44:51 +03:00
|
|
|
w: w,
|
2016-02-11 09:10:40 +03:00
|
|
|
obuf: make([]byte, obufLen),
|
2016-02-11 07:44:51 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewBufferedWriter returns a new Writer that compresses to w, using the
|
|
|
|
// framing format described at
|
|
|
|
// https://github.com/google/snappy/blob/master/framing_format.txt
|
|
|
|
//
|
|
|
|
// The Writer returned buffers writes. Users must call Close to guarantee all
|
|
|
|
// data has been forwarded to the underlying io.Writer. They may also call
|
|
|
|
// Flush zero or more times before calling Close.
|
|
|
|
func NewBufferedWriter(w io.Writer) *Writer {
|
|
|
|
return &Writer{
|
|
|
|
w: w,
|
2016-02-22 04:44:36 +03:00
|
|
|
ibuf: make([]byte, 0, maxBlockSize),
|
2016-02-11 09:10:40 +03:00
|
|
|
obuf: make([]byte, obufLen),
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-10 06:07:10 +03:00
|
|
|
// Writer is an io.Writer that can write Snappy-compressed bytes.
type Writer struct {
	// w is the underlying io.Writer that receives the compressed stream.
	w io.Writer
	// err is the first error encountered (or errClosed after Close). Once
	// set, further writes return it without doing any work.
	err error

	// ibuf is a buffer for the incoming (uncompressed) bytes.
	//
	// Its use is optional. For backwards compatibility, Writers created by the
	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
	// therefore do not need to be Flush'ed or Close'd.
	ibuf []byte

	// obuf is a buffer for the outgoing (compressed) bytes.
	obuf []byte

	// wroteStreamHeader is whether we have written the stream header.
	wroteStreamHeader bool
}
|
|
|
|
|
2015-02-10 06:07:10 +03:00
|
|
|
// Reset discards the writer's state and switches the Snappy writer to write to
|
|
|
|
// w. This permits reusing a Writer rather than allocating a new one.
|
|
|
|
func (w *Writer) Reset(writer io.Writer) {
|
|
|
|
w.w = writer
|
|
|
|
w.err = nil
|
2016-02-11 07:44:51 +03:00
|
|
|
if w.ibuf != nil {
|
|
|
|
w.ibuf = w.ibuf[:0]
|
|
|
|
}
|
|
|
|
w.wroteStreamHeader = false
|
2015-02-10 06:07:10 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Write satisfies the io.Writer interface.
|
2016-02-11 07:44:51 +03:00
|
|
|
func (w *Writer) Write(p []byte) (nRet int, errRet error) {
|
|
|
|
if w.ibuf == nil {
|
|
|
|
// Do not buffer incoming bytes. This does not perform or compress well
|
|
|
|
// if the caller of Writer.Write writes many small slices. This
|
|
|
|
// behavior is therefore deprecated, but still supported for backwards
|
|
|
|
// compatibility with code that doesn't explicitly Flush or Close.
|
|
|
|
return w.write(p)
|
|
|
|
}
|
|
|
|
|
|
|
|
// The remainder of this method is based on bufio.Writer.Write from the
|
|
|
|
// standard library.
|
|
|
|
|
|
|
|
for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
|
|
|
|
var n int
|
|
|
|
if len(w.ibuf) == 0 {
|
|
|
|
// Large write, empty buffer.
|
|
|
|
// Write directly from p to avoid copy.
|
|
|
|
n, _ = w.write(p)
|
|
|
|
} else {
|
|
|
|
n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
|
|
|
|
w.ibuf = w.ibuf[:len(w.ibuf)+n]
|
|
|
|
w.Flush()
|
|
|
|
}
|
|
|
|
nRet += n
|
|
|
|
p = p[n:]
|
|
|
|
}
|
|
|
|
if w.err != nil {
|
|
|
|
return nRet, w.err
|
|
|
|
}
|
|
|
|
n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
|
|
|
|
w.ibuf = w.ibuf[:len(w.ibuf)+n]
|
|
|
|
nRet += n
|
|
|
|
return nRet, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (w *Writer) write(p []byte) (nRet int, errRet error) {
|
2015-02-10 03:56:34 +03:00
|
|
|
if w.err != nil {
|
|
|
|
return 0, w.err
|
|
|
|
}
|
|
|
|
for len(p) > 0 {
|
2016-02-11 09:10:40 +03:00
|
|
|
obufStart := len(magicChunk)
|
|
|
|
if !w.wroteStreamHeader {
|
|
|
|
w.wroteStreamHeader = true
|
|
|
|
copy(w.obuf, magicChunk)
|
|
|
|
obufStart = 0
|
|
|
|
}
|
|
|
|
|
2015-02-10 03:56:34 +03:00
|
|
|
var uncompressed []byte
|
2016-02-22 04:44:36 +03:00
|
|
|
if len(p) > maxBlockSize {
|
|
|
|
uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
|
2015-02-10 03:56:34 +03:00
|
|
|
} else {
|
|
|
|
uncompressed, p = p, nil
|
|
|
|
}
|
|
|
|
checksum := crc(uncompressed)
|
|
|
|
|
|
|
|
// Compress the buffer, discarding the result if the improvement
|
|
|
|
// isn't at least 12.5%.
|
2016-02-11 09:10:40 +03:00
|
|
|
compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
|
2015-02-10 03:56:34 +03:00
|
|
|
chunkType := uint8(chunkTypeCompressedData)
|
2016-02-11 09:10:40 +03:00
|
|
|
chunkLen := 4 + len(compressed)
|
|
|
|
obufEnd := obufHeaderLen + len(compressed)
|
|
|
|
if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
|
|
|
|
chunkType = chunkTypeUncompressedData
|
|
|
|
chunkLen = 4 + len(uncompressed)
|
|
|
|
obufEnd = obufHeaderLen
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|
|
|
|
|
2016-02-11 09:10:40 +03:00
|
|
|
// Fill in the per-chunk header that comes before the body.
|
|
|
|
w.obuf[len(magicChunk)+0] = chunkType
|
|
|
|
w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
|
|
|
|
w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
|
|
|
|
w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
|
|
|
|
w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
|
|
|
|
w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
|
|
|
|
w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
|
|
|
|
w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
|
|
|
|
|
|
|
|
if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
|
2015-02-10 03:56:34 +03:00
|
|
|
w.err = err
|
2016-02-11 07:44:51 +03:00
|
|
|
return nRet, err
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|
2016-02-11 09:10:40 +03:00
|
|
|
if chunkType == chunkTypeUncompressedData {
|
|
|
|
if _, err := w.w.Write(uncompressed); err != nil {
|
|
|
|
w.err = err
|
|
|
|
return nRet, err
|
|
|
|
}
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|
2016-02-11 07:44:51 +03:00
|
|
|
nRet += len(uncompressed)
|
|
|
|
}
|
|
|
|
return nRet, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Flush flushes the Writer to its underlying io.Writer.
|
|
|
|
func (w *Writer) Flush() error {
|
|
|
|
if w.err != nil {
|
|
|
|
return w.err
|
|
|
|
}
|
|
|
|
if len(w.ibuf) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
w.write(w.ibuf)
|
|
|
|
w.ibuf = w.ibuf[:0]
|
|
|
|
return w.err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Close calls Flush and then closes the Writer.
|
|
|
|
func (w *Writer) Close() error {
|
|
|
|
w.Flush()
|
|
|
|
ret := w.err
|
|
|
|
if w.err == nil {
|
|
|
|
w.err = errClosed
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|
2016-02-11 07:44:51 +03:00
|
|
|
return ret
|
2015-02-10 03:56:34 +03:00
|
|
|
}
|