From 9378ba44b3f46d697653003c784be87746e138d2 Mon Sep 17 00:00:00 2001 From: Douglas Gregor Date: Mon, 20 Apr 2009 07:08:21 +0000 Subject: [PATCH] Move the on-disk hash table code into its own header. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@69580 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/OnDiskHashTable.h | 343 ++++++++++++++++++++++++++ lib/Lex/PTHLexer.cpp | 173 +------------ tools/clang-cc/CacheTokens.cpp | 162 +----------- 3 files changed, 347 insertions(+), 331 deletions(-) create mode 100644 include/clang/Basic/OnDiskHashTable.h diff --git a/include/clang/Basic/OnDiskHashTable.h b/include/clang/Basic/OnDiskHashTable.h new file mode 100644 index 0000000000..5008be16f1 --- /dev/null +++ b/include/clang/Basic/OnDiskHashTable.h @@ -0,0 +1,343 @@ +//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines facilities for reading and writing on-disk hash +// tables. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H +#define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H + +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/System/Host.h" +#include +#include + +namespace clang { + +// Bernstein hash function: +// This is basically copy-and-paste from StringMap. This likely won't +// stay here, which is why I didn't both to expose this function from +// String Map. +inline unsigned BernsteinHash(const char* x) { + unsigned int R = 0; + for ( ; *x != '\0' ; ++x) R = R * 33 + *x; + return R + (R >> 5); +} + +inline unsigned BernsteinHash(const char* x, unsigned n) { + unsigned int R = 0; + for (unsigned i = 0 ; i < n ; ++i, ++x) R = R * 33 + *x; + return R + (R >> 5); +} + +namespace io { + +typedef uint32_t Offset; + +inline void Emit8(llvm::raw_ostream& Out, uint32_t V) { + Out << (unsigned char)(V); +} + +inline void Emit16(llvm::raw_ostream& Out, uint32_t V) { + Out << (unsigned char)(V); + Out << (unsigned char)(V >> 8); + assert((V >> 16) == 0); +} + +inline void Emit32(llvm::raw_ostream& Out, uint32_t V) { + Out << (unsigned char)(V); + Out << (unsigned char)(V >> 8); + Out << (unsigned char)(V >> 16); + Out << (unsigned char)(V >> 24); +} + +inline void Emit64(llvm::raw_ostream& Out, uint64_t V) { + Out << (unsigned char)(V); + Out << (unsigned char)(V >> 8); + Out << (unsigned char)(V >> 16); + Out << (unsigned char)(V >> 24); + Out << (unsigned char)(V >> 32); + Out << (unsigned char)(V >> 40); + Out << (unsigned char)(V >> 48); + Out << (unsigned char)(V >> 56); +} + +inline void Pad(llvm::raw_fd_ostream& Out, unsigned A) { + Offset off = (Offset) Out.tell(); + uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off; + for (; n ; --n) + Emit8(Out, 0); +} + +inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { + uint16_t V = ((uint16_t)Data[0]) | + ((uint16_t)Data[1] << 8); + Data += 2; + return V; +} + +inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { + uint32_t V = ((uint32_t)Data[0]) | + ((uint32_t)Data[1] << 8) | + ((uint32_t)Data[2] << 16) | + ((uint32_t)Data[3] << 24); + Data += 4; + return V; +} + +inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { + uint64_t V = ((uint64_t)Data[0]) | + ((uint64_t)Data[1] << 8) | + ((uint64_t)Data[2] << 16) | + ((uint64_t)Data[3] << 24) | + ((uint64_t)Data[4] << 32) | + ((uint64_t)Data[5] << 40) | + ((uint64_t)Data[6] << 48) | + ((uint64_t)Data[7] << 56); + Data += 8; + return V; +} + +inline uint32_t ReadLE32(const unsigned char *&Data) { + // Hosts that directly support little-endian 32-bit loads can just + // use them. Big-endian hosts need a bswap. + uint32_t V = *((uint32_t*)Data); + if (llvm::sys::isBigEndianHost()) + V = llvm::ByteSwap_32(V); + Data += 4; + return V; +} + +} // end namespace io + +template +class OnDiskChainedHashTableGenerator { + unsigned NumBuckets; + unsigned NumEntries; + llvm::BumpPtrAllocator BA; + + class Item { + public: + typename Info::key_type key; + typename Info::data_type data; + Item *next; + const uint32_t hash; + + Item(typename Info::key_type_ref k, typename Info::data_type_ref d) + : key(k), data(d), next(0), hash(Info::ComputeHash(k)) {} + }; + + class Bucket { + public: + io::Offset off; + Item* head; + unsigned length; + + Bucket() {} + }; + + Bucket* Buckets; + +private: + void insert(Bucket* b, size_t size, Item* E) { + unsigned idx = E->hash & (size - 1); + Bucket& B = b[idx]; + E->next = B.head; + ++B.length; + B.head = E; + } + + void resize(size_t newsize) { + Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket)); + // Populate newBuckets with the old entries. + for (unsigned i = 0; i < NumBuckets; ++i) + for (Item* E = Buckets[i].head; E ; ) { + Item* N = E->next; + E->next = 0; + insert(newBuckets, newsize, E); + E = N; + } + + free(Buckets); + NumBuckets = newsize; + Buckets = newBuckets; + } + +public: + + void insert(typename Info::key_type_ref key, + typename Info::data_type_ref data) { + + ++NumEntries; + if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); + insert(Buckets, NumBuckets, new (BA.Allocate()) Item(key, data)); + } + + io::Offset Emit(llvm::raw_fd_ostream& out) { + using namespace clang::io; + + // Emit the payload of the table. + for (unsigned i = 0; i < NumBuckets; ++i) { + Bucket& B = Buckets[i]; + if (!B.head) continue; + + // Store the offset for the data of this bucket. + // FIXME: need tell() to work on other raw ostreams + B.off = out.tell(); + + // Write out the number of items in the bucket. + Emit16(out, B.length); + + // Write out the entries in the bucket. + for (Item *I = B.head; I ; I = I->next) { + Emit32(out, I->hash); + const std::pair& Len = + Info::EmitKeyDataLength(out, I->key, I->data); + Info::EmitKey(out, I->key, Len.first); + Info::EmitData(out, I->key, I->data, Len.second); + } + } + + // Emit the hashtable itself. + Pad(out, 4); + io::Offset TableOff = out.tell(); + Emit32(out, NumBuckets); + Emit32(out, NumEntries); + for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); + + return TableOff; + } + + OnDiskChainedHashTableGenerator() { + NumEntries = 0; + NumBuckets = 64; + // Note that we do not need to run the constructors of the individual + // Bucket objects since 'calloc' returns bytes that are all 0. + Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket)); + } + + ~OnDiskChainedHashTableGenerator() { + std::free(Buckets); + } +}; + +template +class OnDiskChainedHashTable { + const unsigned NumBuckets; + const unsigned NumEntries; + const unsigned char* const Buckets; + const unsigned char* const Base; +public: + typedef typename Info::internal_key_type internal_key_type; + typedef typename Info::external_key_type external_key_type; + typedef typename Info::data_type data_type; + + OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, + const unsigned char* buckets, + const unsigned char* base) + : NumBuckets(numBuckets), NumEntries(numEntries), + Buckets(buckets), Base(base) { + assert((reinterpret_cast(buckets) & 0x3) == 0 && + "'buckets' must have a 4-byte alignment"); + } + + unsigned getNumBuckets() const { return NumBuckets; } + unsigned getNumEntries() const { return NumEntries; } + const unsigned char* getBase() const { return Base; } + const unsigned char* getBuckets() const { return Buckets; } + + bool isEmpty() const { return NumEntries == 0; } + + class iterator { + internal_key_type key; + const unsigned char* const data; + const unsigned len; + public: + iterator() : data(0), len(0) {} + iterator(const internal_key_type k, const unsigned char* d, unsigned l) + : key(k), data(d), len(l) {} + + data_type operator*() const { return Info::ReadData(key, data, len); } + bool operator==(const iterator& X) const { return X.data == data; } + bool operator!=(const iterator& X) const { return X.data != data; } + }; + + iterator find(const external_key_type& eKey) { + using namespace io; + const internal_key_type& iKey = Info::GetInternalKey(eKey); + unsigned key_hash = Info::ComputeHash(iKey); + + // Each bucket is just a 32-bit offset into the PTH file. + unsigned idx = key_hash & (NumBuckets - 1); + const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; + + unsigned offset = ReadLE32(Bucket); + if (offset == 0) return iterator(); // Empty bucket. + const unsigned char* Items = Base + offset; + + // 'Items' starts with a 16-bit unsigned integer representing the + // number of items in this bucket. + unsigned len = ReadUnalignedLE16(Items); + + for (unsigned i = 0; i < len; ++i) { + // Read the hash. + uint32_t item_hash = ReadUnalignedLE32(Items); + + // Determine the length of the key and the data. + const std::pair& L = Info::ReadKeyDataLength(Items); + unsigned item_len = L.first + L.second; + + // Compare the hashes. If they are not the same, skip the entry entirely. + if (item_hash != key_hash) { + Items += item_len; + continue; + } + + // Read the key. + const internal_key_type& X = + Info::ReadKey((const unsigned char* const) Items, L.first); + + // If the key doesn't match just skip reading the value. + if (!Info::EqualKey(X, iKey)) { + Items += item_len; + continue; + } + + // The key matches! + return iterator(X, Items + L.first, L.second); + } + + return iterator(); + } + + iterator end() const { return iterator(); } + + + static OnDiskChainedHashTable* Create(const unsigned char* buckets, + const unsigned char* const base) { + using namespace io; + assert(buckets > base); + assert((reinterpret_cast(buckets) & 0x3) == 0 && + "buckets should be 4-byte aligned."); + + unsigned numBuckets = ReadLE32(buckets); + unsigned numEntries = ReadLE32(buckets); + return new OnDiskChainedHashTable(numBuckets, numEntries, buckets, + base); + } +}; + +} // end namespace clang + +#endif diff --git a/lib/Lex/PTHLexer.cpp b/lib/Lex/PTHLexer.cpp index 923b26cf7d..916bdefdf2 100644 --- a/lib/Lex/PTHLexer.cpp +++ b/lib/Lex/PTHLexer.cpp @@ -14,6 +14,7 @@ #include "clang/Basic/TokenKinds.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/OnDiskHashTable.h" #include "clang/Lex/PTHLexer.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PTHManager.h" @@ -21,74 +22,13 @@ #include "clang/Lex/Preprocessor.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/OwningPtr.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/System/Host.h" #include using namespace clang; +using namespace clang::io; #define DISK_TOKEN_SIZE (1+1+2+4+4) -//===----------------------------------------------------------------------===// -// Utility methods for reading from the mmap'ed PTH file. -//===----------------------------------------------------------------------===// - -static inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { - uint16_t V = ((uint16_t)Data[0]) | - ((uint16_t)Data[1] << 8); - Data += 2; - return V; -} - -static inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { - uint32_t V = ((uint32_t)Data[0]) | - ((uint32_t)Data[1] << 8) | - ((uint32_t)Data[2] << 16) | - ((uint32_t)Data[3] << 24); - Data += 4; - return V; -} - -static inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { - uint64_t V = ((uint64_t)Data[0]) | - ((uint64_t)Data[1] << 8) | - ((uint64_t)Data[2] << 16) | - ((uint64_t)Data[3] << 24) | - ((uint64_t)Data[4] << 32) | - ((uint64_t)Data[5] << 40) | - ((uint64_t)Data[6] << 48) | - ((uint64_t)Data[7] << 56); - Data += 8; - return V; -} - -static inline uint32_t ReadLE32(const unsigned char *&Data) { - // Hosts that directly support little-endian 32-bit loads can just - // use them. Big-endian hosts need a bswap. - uint32_t V = *((uint32_t*)Data); - if (llvm::sys::isBigEndianHost()) - V = llvm::ByteSwap_32(V); - Data += 4; - return V; -} - -// Bernstein hash function: -// This is basically copy-and-paste from StringMap. This likely won't -// stay here, which is why I didn't both to expose this function from -// String Map. -static unsigned BernsteinHash(const char* x) { - unsigned int R = 0; - for ( ; *x != '\0' ; ++x) R = R * 33 + *x; - return R + (R >> 5); -} - -static unsigned BernsteinHash(const char* x, unsigned n) { - unsigned int R = 0; - for (unsigned i = 0 ; i < n ; ++i, ++x) R = R * 33 + *x; - return R + (R >> 5); -} - //===----------------------------------------------------------------------===// // PTHLexer methods. //===----------------------------------------------------------------------===// @@ -343,115 +283,6 @@ SourceLocation PTHLexer::getSourceLocation() { return FileStartLoc.getFileLocWithOffset(Offset); } -//===----------------------------------------------------------------------===// -// OnDiskChainedHashTable -//===----------------------------------------------------------------------===// - -template -class OnDiskChainedHashTable { - const unsigned NumBuckets; - const unsigned NumEntries; - const unsigned char* const Buckets; - const unsigned char* const Base; -public: - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - - OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, - const unsigned char* buckets, - const unsigned char* base) - : NumBuckets(numBuckets), NumEntries(numEntries), - Buckets(buckets), Base(base) { - assert((reinterpret_cast(buckets) & 0x3) == 0 && - "'buckets' must have a 4-byte alignment"); - } - - unsigned getNumBuckets() const { return NumBuckets; } - unsigned getNumEntries() const { return NumEntries; } - const unsigned char* getBase() const { return Base; } - const unsigned char* getBuckets() const { return Buckets; } - - bool isEmpty() const { return NumEntries == 0; } - - class iterator { - internal_key_type key; - const unsigned char* const data; - const unsigned len; - public: - iterator() : data(0), len(0) {} - iterator(const internal_key_type k, const unsigned char* d, unsigned l) - : key(k), data(d), len(l) {} - - data_type operator*() const { return Info::ReadData(key, data, len); } - bool operator==(const iterator& X) const { return X.data == data; } - bool operator!=(const iterator& X) const { return X.data != data; } - }; - - iterator find(const external_key_type& eKey) { - const internal_key_type& iKey = Info::GetInternalKey(eKey); - unsigned key_hash = Info::ComputeHash(iKey); - - // Each bucket is just a 32-bit offset into the PTH file. - unsigned idx = key_hash & (NumBuckets - 1); - const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; - - unsigned offset = ReadLE32(Bucket); - if (offset == 0) return iterator(); // Empty bucket. - const unsigned char* Items = Base + offset; - - // 'Items' starts with a 16-bit unsigned integer representing the - // number of items in this bucket. - unsigned len = ReadUnalignedLE16(Items); - - for (unsigned i = 0; i < len; ++i) { - // Read the hash. - uint32_t item_hash = ReadUnalignedLE32(Items); - - // Determine the length of the key and the data. - const std::pair& L = Info::ReadKeyDataLength(Items); - unsigned item_len = L.first + L.second; - - // Compare the hashes. If they are not the same, skip the entry entirely. - if (item_hash != key_hash) { - Items += item_len; - continue; - } - - // Read the key. - const internal_key_type& X = - Info::ReadKey((const unsigned char* const) Items, L.first); - - // If the key doesn't match just skip reading the value. - if (!Info::EqualKey(X, iKey)) { - Items += item_len; - continue; - } - - // The key matches! - return iterator(X, Items + L.first, L.second); - } - - return iterator(); - } - - iterator end() const { return iterator(); } - - - static OnDiskChainedHashTable* Create(const unsigned char* buckets, - const unsigned char* const base) { - - assert(buckets > base); - assert((reinterpret_cast(buckets) & 0x3) == 0 && - "buckets should be 4-byte aligned."); - - unsigned numBuckets = ReadLE32(buckets); - unsigned numEntries = ReadLE32(buckets); - return new OnDiskChainedHashTable(numBuckets, numEntries, buckets, - base); - } -}; - //===----------------------------------------------------------------------===// // PTH file lookup: map from strings to file data. //===----------------------------------------------------------------------===// diff --git a/tools/clang-cc/CacheTokens.cpp b/tools/clang-cc/CacheTokens.cpp index 2b08818f8f..a886ba135a 100644 --- a/tools/clang-cc/CacheTokens.cpp +++ b/tools/clang-cc/CacheTokens.cpp @@ -17,6 +17,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/Diagnostic.h" +#include "clang/Basic/OnDiskHashTable.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" #include "llvm/ADT/StringMap.h" @@ -32,166 +33,7 @@ #endif using namespace clang; - -typedef uint32_t Offset; - -static void Emit8(llvm::raw_ostream& Out, uint32_t V) { - Out << (unsigned char)(V); -} - -static void Emit16(llvm::raw_ostream& Out, uint32_t V) { - Out << (unsigned char)(V); - Out << (unsigned char)(V >> 8); - assert((V >> 16) == 0); -} - -static void Emit32(llvm::raw_ostream& Out, uint32_t V) { - Out << (unsigned char)(V); - Out << (unsigned char)(V >> 8); - Out << (unsigned char)(V >> 16); - Out << (unsigned char)(V >> 24); -} - -static void Emit64(llvm::raw_ostream& Out, uint64_t V) { - Out << (unsigned char)(V); - Out << (unsigned char)(V >> 8); - Out << (unsigned char)(V >> 16); - Out << (unsigned char)(V >> 24); - Out << (unsigned char)(V >> 32); - Out << (unsigned char)(V >> 40); - Out << (unsigned char)(V >> 48); - Out << (unsigned char)(V >> 56); -} - -static void Pad(llvm::raw_fd_ostream& Out, unsigned A) { - Offset off = (Offset) Out.tell(); - uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off; - for (; n ; --n) - Emit8(Out, 0); -} - -// Bernstein hash function: -// This is basically copy-and-paste from StringMap. This likely won't -// stay here, which is why I didn't both to expose this function from -// String Map. -static unsigned BernsteinHash(const char* x) { - unsigned int R = 0; - for ( ; *x != '\0' ; ++x) R = R * 33 + *x; - return R + (R >> 5); -} - -//===----------------------------------------------------------------------===// -// On Disk Hashtable Logic. This will eventually get refactored and put -// elsewhere. -//===----------------------------------------------------------------------===// - -template -class OnDiskChainedHashTableGenerator { - unsigned NumBuckets; - unsigned NumEntries; - llvm::BumpPtrAllocator BA; - - class Item { - public: - typename Info::key_type key; - typename Info::data_type data; - Item *next; - const uint32_t hash; - - Item(typename Info::key_type_ref k, typename Info::data_type_ref d) - : key(k), data(d), next(0), hash(Info::ComputeHash(k)) {} - }; - - class Bucket { - public: - Offset off; - Item* head; - unsigned length; - - Bucket() {} - }; - - Bucket* Buckets; - -private: - void insert(Bucket* b, size_t size, Item* E) { - unsigned idx = E->hash & (size - 1); - Bucket& B = b[idx]; - E->next = B.head; - ++B.length; - B.head = E; - } - - void resize(size_t newsize) { - Bucket* newBuckets = (Bucket*) calloc(newsize, sizeof(Bucket)); - // Populate newBuckets with the old entries. - for (unsigned i = 0; i < NumBuckets; ++i) - for (Item* E = Buckets[i].head; E ; ) { - Item* N = E->next; - E->next = 0; - insert(newBuckets, newsize, E); - E = N; - } - - free(Buckets); - NumBuckets = newsize; - Buckets = newBuckets; - } - -public: - - void insert(typename Info::key_type_ref key, - typename Info::data_type_ref data) { - - ++NumEntries; - if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); - insert(Buckets, NumBuckets, new (BA.Allocate()) Item(key, data)); - } - - Offset Emit(llvm::raw_fd_ostream& out) { - // Emit the payload of the table. - for (unsigned i = 0; i < NumBuckets; ++i) { - Bucket& B = Buckets[i]; - if (!B.head) continue; - - // Store the offset for the data of this bucket. - B.off = out.tell(); - - // Write out the number of items in the bucket. - Emit16(out, B.length); - - // Write out the entries in the bucket. - for (Item *I = B.head; I ; I = I->next) { - Emit32(out, I->hash); - const std::pair& Len = - Info::EmitKeyDataLength(out, I->key, I->data); - Info::EmitKey(out, I->key, Len.first); - Info::EmitData(out, I->key, I->data, Len.second); - } - } - - // Emit the hashtable itself. - Pad(out, 4); - Offset TableOff = out.tell(); - Emit32(out, NumBuckets); - Emit32(out, NumEntries); - for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); - - return TableOff; - } - - OnDiskChainedHashTableGenerator() { - NumEntries = 0; - NumBuckets = 64; - // Note that we do not need to run the constructors of the individual - // Bucket objects since 'calloc' returns bytes that are all 0. - Buckets = (Bucket*) calloc(NumBuckets, sizeof(Bucket)); - } - - ~OnDiskChainedHashTableGenerator() { - free(Buckets); - } -}; +using namespace clang::io; //===----------------------------------------------------------------------===// // PTH-specific stuff.