diff --git a/.hgignore b/.hgignore index 3509e7a2bdbe..2c80192f3dea 100644 --- a/.hgignore +++ b/.hgignore @@ -36,3 +36,6 @@ _OPT\.OBJ/ # Java HTML5 parser classes ^parser/html/java/(html|java)parser/ + +# SVN directories +\.svn/ diff --git a/other-licenses/snappy/README b/other-licenses/snappy/README index 1a56b5bae8f0..098305829b82 100644 --- a/other-licenses/snappy/README +++ b/other-licenses/snappy/README @@ -6,6 +6,8 @@ Mozilla does not modify the actual snappy source with the exception of the Snappy comes from: http://code.google.com/p/snappy/ +We are currently using revision: 56 + To upgrade to a newer version: 1. Check out the new code using subversion. 2. Update 'snappy-stubs-public.h' in this directory with any changes that were @@ -20,3 +22,4 @@ To upgrade to a newer version: - 'autogen.sh' - 'configure.ac' - 'Makefile.am' + 5. Update the revision stamp in this file. diff --git a/other-licenses/snappy/src/framing_format.txt b/other-licenses/snappy/src/framing_format.txt new file mode 100644 index 000000000000..08fda03f7d4f --- /dev/null +++ b/other-licenses/snappy/src/framing_format.txt @@ -0,0 +1,124 @@ +Snappy framing format description +Last revised: 2011-12-15 + +This format describes a framing format for Snappy, allowing compressing to +files or streams that can then more easily be decompressed without having +to hold the entire stream in memory. It also provides data checksums to +help verify integrity. It does not provide metadata checksums, so it does +not protect against e.g. all forms of truncation. + +Implementation of the framing format is optional for Snappy compressors and +decompressors; it is not part of the Snappy core specification. + + +1. General structure + +The file consists solely of chunks, lying back-to-back with no padding +in between. Each chunk consists first of a single byte of chunk identifier, +then a two-byte little-endian length of the chunk in bytes (from 0 to 65535, +inclusive), and then the data if any. 
The three bytes of the chunk header are not +counted in the data length. + +The different chunk types are listed below. The first chunk must always +be the stream identifier chunk (see section 4.1, below). The stream +ends when the file ends -- there is no explicit end-of-file marker. + + +2. File type identification + +The following identifiers for this format are recommended where appropriate. +However, note that none have been registered officially, so this is only to +be taken as a guideline. We use "Snappy framed" to distinguish between this +format and raw Snappy data. + + File extension: .sz + MIME type: application/x-snappy-framed + HTTP Content-Encoding: x-snappy-framed + + +3. Checksum format + +Some chunks have data protected by a checksum (the ones that do will say so +explicitly). The checksums are always masked CRC-32Cs. + +A description of CRC-32C can be found in RFC 3720, section 12.1, with +examples in section B.4. + +Checksums are not stored directly, but masked, as checksumming data and +then its own checksum can be problematic. The masking is the same as used +in Apache Hadoop: Rotate the checksum right by 15 bits, then add the constant +0xa282ead8 (using wraparound as normal for unsigned integers). This is +equivalent to the following C code: + + uint32_t mask_checksum(uint32_t x) { + return ((x >> 15) | (x << 17)) + 0xa282ead8; + } + +Note that the masking is reversible. + +The checksum is always stored as a four-byte integer, in little-endian byte order. + + +4. Chunk types + +The currently supported chunk types are described below. The list may +be extended in the future. + + +4.1. Stream identifier (chunk type 0xff) + +The stream identifier is always the first element in the stream. +It is exactly six bytes long and contains "sNaPpY" in ASCII. 
This means that +a valid Snappy framed stream always starts with the bytes + + 0xff 0x06 0x00 0x73 0x4e 0x61 0x50 0x70 0x59 + +The stream identifier chunk can come multiple times in the stream besides +the first; if such a chunk shows up, it should simply be ignored, assuming +it has the right length and contents. This allows for easy concatenation of +compressed files without the need for re-framing. + + +4.2. Compressed data (chunk type 0x00) + +Compressed data chunks contain a normal Snappy compressed bitstream; +see the compressed format specification. The compressed data is preceded by +the CRC-32C (see section 3) of the _uncompressed_ data. + +Note that the data portion of the chunk, i.e., the compressed contents, +can be at most 65531 bytes (2^16 - 1, minus the checksum). +However, we place an additional restriction that the uncompressed data +in a chunk must be no longer than 32768 bytes. This allows consumers to +easily use small fixed-size buffers. + + +4.3. Uncompressed data (chunk type 0x01) + +Uncompressed data chunks allow a compressor to send uncompressed, +raw data; this is useful if, for instance, uncompressible or +near-incompressible data is detected, and faster decompression is desired. + +As in the compressed chunks, the data is preceded by its own masked +CRC-32C (see section 3). + +An uncompressed data chunk, like compressed data chunks, should contain +no more than 32768 data bytes, so the maximum legal chunk length with the +checksum is 32772. + + +4.4. Reserved unskippable chunks (chunk types 0x02-0x7f) + +These are reserved for future expansion. A decoder that sees such a chunk +should immediately return an error, as it must assume it cannot decode the +stream correctly. + +Future versions of this specification may define meanings for these chunks. + + +4.5. 
Reserved skippable chunks (chunk types 0x80-0xfe) + +These are also reserved for future expansion, but unlike the chunks +described in 4.4, a decoder seeing these must skip them and continue +decoding. + +Future versions of this specification may define meanings for these chunks. diff --git a/other-licenses/snappy/src/snappy-stubs-internal.h b/other-licenses/snappy/src/snappy-stubs-internal.h index 021528893357..12ba1ab8b18f 100644 --- a/other-licenses/snappy/src/snappy-stubs-internal.h +++ b/other-licenses/snappy/src/snappy-stubs-internal.h @@ -86,10 +86,9 @@ using namespace std; // version (anyone who wants to regenerate it can just do the call // themselves within main()). #define DEFINE_bool(flag_name, default_value, description) \ - bool FLAGS_ ## flag_name = default_value; + bool FLAGS_ ## flag_name = default_value #define DECLARE_bool(flag_name) \ - extern bool FLAGS_ ## flag_name; -#define REGISTER_MODULE_INITIALIZER(name, code) + extern bool FLAGS_ ## flag_name namespace snappy { diff --git a/other-licenses/snappy/src/snappy-test.cc b/other-licenses/snappy/src/snappy-test.cc index 2c503886e47f..223cd92d5ba3 100644 --- a/other-licenses/snappy/src/snappy-test.cc +++ b/other-licenses/snappy/src/snappy-test.cc @@ -353,7 +353,6 @@ int ZLib::CompressAtMostOrAll(Bytef *dest, uLongf *destLen, // compression. 
err = deflate(&comp_stream_, flush_mode); - const uLong source_bytes_consumed = *sourceLen - comp_stream_.avail_in; *sourceLen = comp_stream_.avail_in; if ((err == Z_STREAM_END || err == Z_OK) @@ -397,7 +396,6 @@ int ZLib::CompressChunkOrAll(Bytef *dest, uLongf *destLen, int ZLib::Compress(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen) { int err; - const uLongf orig_destLen = *destLen; if ( (err=CompressChunkOrAll(dest, destLen, source, sourceLen, Z_FINISH)) != Z_OK ) return err; diff --git a/other-licenses/snappy/src/snappy-test.h b/other-licenses/snappy/src/snappy-test.h index 649f26e7ac1f..ef6a95539d1e 100644 --- a/other-licenses/snappy/src/snappy-test.h +++ b/other-licenses/snappy/src/snappy-test.h @@ -135,7 +135,7 @@ namespace File { while (!feof(fp)) { char buf[4096]; size_t ret = fread(buf, 1, 4096, fp); - if (ret == -1) { + if (ret == 0 && ferror(fp)) { perror("fread"); exit(1); } diff --git a/other-licenses/snappy/src/snappy.cc b/other-licenses/snappy/src/snappy.cc index c79edb58a7fe..5dce19a4ac60 100644 --- a/other-licenses/snappy/src/snappy.cc +++ b/other-licenses/snappy/src/snappy.cc @@ -194,13 +194,13 @@ static inline char* EmitLiteral(char* op, return op + len; } -static inline char* EmitCopyLessThan64(char* op, int offset, int len) { +static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) { DCHECK_LE(len, 64); DCHECK_GE(len, 4); DCHECK_LT(offset, 65536); if ((len < 12) && (offset < 2048)) { - int len_minus_4 = len - 4; + size_t len_minus_4 = len - 4; assert(len_minus_4 < 8); // Must fit in 3 bits *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5); *op++ = offset & 0xff; @@ -212,7 +212,7 @@ static inline char* EmitCopyLessThan64(char* op, int offset, int len) { return op; } -static inline char* EmitCopy(char* op, int offset, int len) { +static inline char* EmitCopy(char* op, size_t offset, int len) { // Emit 64 byte copies but make sure to keep at least four bytes reserved while (len >= 68) 
{ op = EmitCopyLessThan64(op, offset, 64); @@ -249,7 +249,7 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) { // compression, and if the input is short, we won't need that // many hash table entries anyway. assert(kMaxHashTableSize >= 256); - int htsize = 256; + size_t htsize = 256; while (htsize < kMaxHashTableSize && htsize < input_size) { htsize <<= 1; } @@ -304,14 +304,14 @@ char* CompressFragment(const char* input, CHECK_LE(input_size, kBlockSize); CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two"; const int shift = 32 - Bits::Log2Floor(table_size); - DCHECK_EQ(kuint32max >> shift, table_size - 1); + DCHECK_EQ(static_cast(kuint32max >> shift), table_size - 1); const char* ip_end = input + input_size; const char* base_ip = ip; // Bytes in [next_emit, ip) will be emitted as literal bytes. Or // [next_emit, ip_end) after the main loop. const char* next_emit = ip; - const int kInputMarginBytes = 15; + const size_t kInputMarginBytes = 15; if (PREDICT_TRUE(input_size >= kInputMarginBytes)) { const char* ip_limit = input + input_size - kInputMarginBytes; @@ -387,7 +387,7 @@ char* CompressFragment(const char* input, const char* base = ip; int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); ip += matched; - int offset = base - candidate; + size_t offset = base - candidate; DCHECK_EQ(0, memcmp(base, candidate, matched)); op = EmitCopy(op, offset, matched); // We could immediately start working at ip now, but to improve @@ -435,12 +435,26 @@ char* CompressFragment(const char* input, // bool CheckLength() const; // // // Called repeatedly during decompression -// bool Append(const char* ip, uint32 length, bool allow_fast_path); -// bool AppendFromSelf(uint32 offset, uint32 length); -// }; +// bool Append(const char* ip, size_t length); +// bool AppendFromSelf(uint32 offset, size_t length); // -// "allow_fast_path" is a parameter that says if there is at least 16 -// readable bytes in "ip". 
It is currently only used by SnappyArrayWriter. +// // The difference between TryFastAppend and Append is that TryFastAppend +// // is allowed to read up to <available> bytes from the input buffer, +// // whereas Append is allowed to read <length>. +// // +// // Also, TryFastAppend is allowed to return false, declining the append, +// // without it being a fatal error -- just "return false" would be +// // a perfectly legal implementation of TryFastAppend. The intention +// // is for TryFastAppend to allow a fast path in the common case of +// // a small append. +// // +// // NOTE(user): TryFastAppend must always decline (return false) +// // if <length> is 61 or more, as in this case the literal length is not +// // decoded fully. In practice, this should not be a big problem, +// // as it is unlikely that one would implement a fast path accepting +// // this much data. +// bool TryFastAppend(const char* ip, size_t available, size_t length); +// }; // ----------------------------------------------------------------------- // Lookup table for decompression code. Generated by ComputeTable() below. @@ -587,7 +601,6 @@ static void ComputeTable() { CHECK_EQ(dst[i], char_table[i]); } } -REGISTER_MODULE_INITIALIZER(snappy, ComputeTable()); #endif /* !NDEBUG */ // Helper class for decompression @@ -655,29 +668,41 @@ class SnappyDecompressor { template void DecompressAllTags(Writer* writer) { const char* ip = ip_; - for ( ;; ) { - if (ip_limit_ - ip < 5) { - ip_ = ip; - if (!RefillTag()) return; - ip = ip_; - } + // We could have put this refill fragment only at the beginning of the loop. + // However, duplicating it at the end of each branch gives the compiler more + // scope to optimize the <ip_limit_ - ip> expression based on the local + // context, which overall increases speed. 
+ #define MAYBE_REFILL() \ + if (ip_limit_ - ip < 5) { \ + ip_ = ip; \ + if (!RefillTag()) return; \ + ip = ip_; \ + } + + MAYBE_REFILL(); + for ( ;; ) { const unsigned char c = *(reinterpret_cast(ip++)); if ((c & 0x3) == LITERAL) { - uint32 literal_length = c >> 2; - if (PREDICT_FALSE(literal_length >= 60)) { + size_t literal_length = (c >> 2) + 1u; + if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) { + DCHECK_LT(literal_length, 61); + ip += literal_length; + MAYBE_REFILL(); + continue; + } + if (PREDICT_FALSE(literal_length >= 61)) { // Long literal. - const uint32 literal_length_length = literal_length - 59; + const size_t literal_length_length = literal_length - 60; literal_length = - LittleEndian::Load32(ip) & wordmask[literal_length_length]; + (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1; ip += literal_length_length; } - ++literal_length; - uint32 avail = ip_limit_ - ip; + size_t avail = ip_limit_ - ip; while (avail < literal_length) { - if (!writer->Append(ip, avail, false)) return; + if (!writer->Append(ip, avail)) return; literal_length -= avail; reader_->Skip(peeked_); size_t n; @@ -687,11 +712,11 @@ class SnappyDecompressor { if (avail == 0) return; // Premature end of input ip_limit_ = ip + avail; } - bool allow_fast_path = (avail >= 16); - if (!writer->Append(ip, literal_length, allow_fast_path)) { + if (!writer->Append(ip, literal_length)) { return; } ip += literal_length; + MAYBE_REFILL(); } else { const uint32 entry = char_table[c]; const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11]; @@ -705,8 +730,11 @@ class SnappyDecompressor { if (!writer->AppendFromSelf(copy_offset + trailer, length)) { return; } + MAYBE_REFILL(); } } + +#undef MAYBE_REFILL } }; @@ -796,7 +824,7 @@ bool GetUncompressedLength(Source* source, uint32* result) { size_t Compress(Source* reader, Sink* writer) { size_t written = 0; - int N = reader->Available(); + size_t N = reader->Available(); char ulength[Varint::kMax32]; char* 
p = Varint::Encode32(ulength, N); writer->Append(ulength, p-ulength); @@ -811,10 +839,10 @@ size_t Compress(Source* reader, Sink* writer) { size_t fragment_size; const char* fragment = reader->Peek(&fragment_size); DCHECK_NE(fragment_size, 0) << ": premature end of input"; - const int num_to_read = min(N, kBlockSize); + const size_t num_to_read = min(N, kBlockSize); size_t bytes_read = fragment_size; - int pending_advance = 0; + size_t pending_advance = 0; if (bytes_read >= num_to_read) { // Buffer returned by reader is large enough pending_advance = num_to_read; @@ -902,26 +930,34 @@ class SnappyArrayWriter { return op_ == op_limit_; } - inline bool Append(const char* ip, uint32 len, bool allow_fast_path) { + inline bool Append(const char* ip, size_t len) { char* op = op_; - const int space_left = op_limit_ - op; - if (allow_fast_path && len <= 16 && space_left >= 16) { - // Fast path, used for the majority (about 90%) of dynamic invocations. - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); - } else { - if (space_left < len) { - return false; - } - memcpy(op, ip, len); + const size_t space_left = op_limit_ - op; + if (space_left < len) { + return false; } + memcpy(op, ip, len); op_ = op + len; return true; } - inline bool AppendFromSelf(uint32 offset, uint32 len) { + inline bool TryFastAppend(const char* ip, size_t available, size_t len) { char* op = op_; - const int space_left = op_limit_ - op; + const size_t space_left = op_limit_ - op; + if (len <= 16 && available >= 16 && space_left >= 16) { + // Fast path, used for the majority (about 95%) of invocations. 
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); + UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + op_ = op + len; + return true; + } else { + return false; + } + } + + inline bool AppendFromSelf(size_t offset, size_t len) { + char* op = op_; + const size_t space_left = op_limit_ - op; if (op - base_ <= offset - 1u) { // -1u catches offset==0 return false; @@ -985,11 +1021,14 @@ class SnappyDecompressionValidator { inline bool CheckLength() const { return expected_ == produced_; } - inline bool Append(const char* ip, uint32 len, bool allow_fast_path) { + inline bool Append(const char* ip, size_t len) { produced_ += len; return produced_ <= expected_; } - inline bool AppendFromSelf(uint32 offset, uint32 len) { + inline bool TryFastAppend(const char* ip, size_t available, size_t length) { + return false; + } + inline bool AppendFromSelf(size_t offset, size_t len) { if (produced_ <= offset - 1u) return false; // -1u catches offset==0 produced_ += len; return produced_ <= expected_; diff --git a/other-licenses/snappy/src/snappy.h b/other-licenses/snappy/src/snappy.h index 8d6ef2294f55..8c2075fefa5d 100644 --- a/other-licenses/snappy/src/snappy.h +++ b/other-licenses/snappy/src/snappy.h @@ -144,10 +144,10 @@ namespace snappy { // decompression code should not rely on this guarantee since older // compression code may not obey it. 
static const int kBlockLog = 15; - static const int kBlockSize = 1 << kBlockLog; + static const size_t kBlockSize = 1 << kBlockLog; static const int kMaxHashTableBits = 14; - static const int kMaxHashTableSize = 1 << kMaxHashTableBits; + static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits; } // end namespace snappy diff --git a/other-licenses/snappy/src/snappy_unittest.cc b/other-licenses/snappy/src/snappy_unittest.cc index 6fff333ad139..0984e3e26672 100644 --- a/other-licenses/snappy/src/snappy_unittest.cc +++ b/other-licenses/snappy/src/snappy_unittest.cc @@ -300,7 +300,7 @@ static bool Uncompress(const string& compressed, CompressorType comp, reinterpret_cast(compressed.data()), compressed.size()); CHECK_EQ(Z_OK, ret); - CHECK_EQ(destlen, size); + CHECK_EQ(static_cast(size), destlen); break; } #endif // ZLIB_VERSION @@ -316,7 +316,7 @@ static bool Uncompress(const string& compressed, CompressorType comp, &destlen, NULL); CHECK_EQ(LZO_E_OK, ret); - CHECK_EQ(destlen, size); + CHECK_EQ(static_cast(size), destlen); break; } #endif // LZO_VERSION