diff --git a/Driver/CacheTokens.cpp b/Driver/CacheTokens.cpp index 58f1d21c37..d8a1fe1db1 100644 --- a/Driver/CacheTokens.cpp +++ b/Driver/CacheTokens.cpp @@ -53,6 +53,16 @@ static void Pad(llvm::raw_fd_ostream& Out, unsigned A) { for ( ; n ; --n ) Emit8(Out, 0); } +// Bernstein hash function: +// This is basically copy-and-paste from StringMap. This likely won't +// stay here, which is why I didn't both to expose this function from +// String Map. +static unsigned BernsteinHash(const char* x) { + unsigned int R = 0; + for ( ; *x != '\0' ; ++x) R = R * 33 + *x; + return R + (R >> 5); +} + //===----------------------------------------------------------------------===// // On Disk Hashtable Logic. This will eventually get refactored and put // elsewhere. @@ -193,14 +203,7 @@ public: typedef const PCHEntry& data_type_ref; static unsigned ComputeHash(const FileEntry* FE) { - // Bernstein hash function: - // This is basically copy-and-paste from StringMap. This likely won't - // stay here, which is why I didn't both to expose this function from - // String Map. There are plenty of other hash functions which are likely - // to perform better and be faster. - unsigned int R = 0; - for (const char* x = FE->getName(); *x != '\0' ; ++x) R = R * 33 + *x; - return R + (R >> 5); + return BernsteinHash(FE->getName()); } static std::pair @@ -273,6 +276,10 @@ class VISIBILITY_HIDDEN PTHWriter { for ( ; I != E ; ++I) Out << *I; } + /// EmitIdentifierTable - Emits two tables to the PTH file. The first is + /// a hashtable mapping from identifier strings to persistent IDs. + /// The second is a straight table mapping from persistent IDs to string data + /// (the keys of the first table). std::pair EmitIdentifierTable(); /// EmitFileTable - Emit a table mapping from file name strings to PTH @@ -335,83 +342,6 @@ void PTHWriter::EmitToken(const Token& T) { Emit32(PP.getSourceManager().getFileOffset(T.getLocation())); } -namespace { -struct VISIBILITY_HIDDEN IDData { - const IdentifierInfo* II; - uint32_t FileOffset; -}; - -class VISIBILITY_HIDDEN CompareIDDataIndex { - IDData* Table; -public: - CompareIDDataIndex(IDData* table) : Table(table) {} - - bool operator()(unsigned i, unsigned j) const { - const IdentifierInfo* II_i = Table[i].II; - const IdentifierInfo* II_j = Table[j].II; - - unsigned i_len = II_i->getLength(); - unsigned j_len = II_j->getLength(); - - if (i_len > j_len) - return false; - - if (i_len < j_len) - return true; - - // Otherwise, compare the strings themselves! - return strncmp(II_i->getName(), II_j->getName(), i_len) < 0; - } -}; -} - -std::pair PTHWriter::EmitIdentifierTable() { - llvm::BumpPtrAllocator Alloc; - - // Build an inverse map from persistent IDs -> IdentifierInfo*. - IDData* IIDMap = Alloc.Allocate(idcount); - - // Generate mapping from persistent IDs -> IdentifierInfo*. - for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) { - // Decrement by 1 because we are using a vector for the lookup and - // 0 is reserved for NULL. - assert(I->second > 0); - assert(I->second-1 < idcount); - unsigned idx = I->second-1; - IIDMap[idx].II = I->first; - } - - // We want to write out the strings in lexical order to support binary - // search of strings to identifiers. Create such a table. - unsigned *LexicalOrder = Alloc.Allocate(idcount); - for (unsigned i = 0; i < idcount ; ++i ) LexicalOrder[i] = i; - std::sort(LexicalOrder, LexicalOrder+idcount, CompareIDDataIndex(IIDMap)); - - // Write out the lexically-sorted table of persistent ids. - Offset LexicalOff = Out.tell(); - for (unsigned i = 0; i < idcount ; ++i) Emit32(LexicalOrder[i]); - - for (unsigned i = 0; i < idcount; ++i) { - IDData& d = IIDMap[i]; - d.FileOffset = Out.tell(); // Record the location for this data. - unsigned len = d.II->getLength(); // Write out the string length. - Emit32(len); - const char* buf = d.II->getName(); // Write out the string data. - EmitBuf(buf, buf+len); - // Emit a null character for those clients expecting that IdentifierInfo - // strings are null terminated. - Emit8('\0'); - } - - // Now emit the table mapping from persistent IDs to PTH file offsets. - Offset IDOff = Out.tell(); - Emit32(idcount); // Emit the number of identifiers. - for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset); - - return std::make_pair(IDOff, LexicalOff); -} - - PCHEntry PTHWriter::LexTokens(Lexer& L) { // Pad 0's so that we emit tokens to a 4-byte alignment. // This speed up reading them back in. @@ -657,3 +587,92 @@ void clang::CacheTokens(Preprocessor& PP, const std::string& OutFile) { PW.GeneratePTH(); } +//===----------------------------------------------------------------------===// + +namespace { +class VISIBILITY_HIDDEN PCHIdKey { +public: + const IdentifierInfo* II; + uint32_t FileOffset; +}; + +class VISIBILITY_HIDDEN PCHIdentifierTableTrait { +public: + typedef PCHIdKey* key_type; + typedef key_type key_type_ref; + + typedef uint32_t data_type; + typedef data_type data_type_ref; + + static unsigned ComputeHash(PCHIdKey* key) { + return BernsteinHash(key->II->getName()); + } + + static std::pair + EmitKeyDataLength(llvm::raw_ostream& Out, const PCHIdKey* key, uint32_t) { + unsigned n = strlen(key->II->getName()) + 1; + ::Emit16(Out, n); + return std::make_pair(n, sizeof(uint32_t)); + } + + static void EmitKey(llvm::raw_fd_ostream& Out, PCHIdKey* key, unsigned n) { + // Record the location of the key data. This is used when generating + // the mapping from persistent IDs to strings. + key->FileOffset = Out.tell(); + Out.write(key->II->getName(), n); + } + + static void EmitData(llvm::raw_ostream& Out, uint32_t pID, unsigned) { + ::Emit32(Out, pID); + } +}; +} // end anonymous namespace + +/// EmitIdentifierTable - Emits two tables to the PTH file. The first is +/// a hashtable mapping from identifier strings to persistent IDs. The second +/// is a straight table mapping from persistent IDs to string data (the +/// keys of the first table). +/// +std::pair PTHWriter::EmitIdentifierTable() { + // Build two maps: + // (1) an inverse map from persistent IDs -> (IdentifierInfo*,Offset) + // (2) a map from (IdentifierInfo*, Offset)* -> persistent IDs + + // Note that we use 'calloc', so all the bytes are 0. + PCHIdKey* IIDMap = (PCHIdKey*) calloc(idcount, sizeof(PCHIdKey)); + + // Create the hashtable. + OnDiskChainedHashTableGenerator IIOffMap; + + // Generate mapping from persistent IDs -> IdentifierInfo*. + for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) { + // Decrement by 1 because we are using a vector for the lookup and + // 0 is reserved for NULL. + assert(I->second > 0); + assert(I->second-1 < idcount); + unsigned idx = I->second-1; + + // Store the mapping from persistent ID to IdentifierInfo* + IIDMap[idx].II = I->first; + + // Store the reverse mapping in a hashtable. + IIOffMap.insert(&IIDMap[idx], I->second); + } + + // Write out the inverse map first. This causes the PCIDKey entries to + // record PTH file offsets for the string data. This is used to write + // the second table. + Offset StringTableOffset = IIOffMap.Emit(Out); + + // Now emit the table mapping from persistent IDs to PTH file offsets. + Offset IDOff = Out.tell(); + Emit32(idcount); // Emit the number of identifiers. + for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset); + + // Finally, release the inverse map. + free(IIDMap); + + return std::make_pair(IDOff, StringTableOffset); +} + + diff --git a/include/clang/Basic/IdentifierTable.h b/include/clang/Basic/IdentifierTable.h index 03533f87b9..27697df76c 100644 --- a/include/clang/Basic/IdentifierTable.h +++ b/include/clang/Basic/IdentifierTable.h @@ -83,24 +83,26 @@ public: /// const char *getName() const { if (Entry) return Entry->getKeyData(); + // FIXME: This is gross. It would be best not to embed specific details + // of the PTH file format here. // The 'this' pointer really points to a // std::pair, where internal pointer // points to the external string data. - return ((std::pair*) this)->second + 4; + return ((std::pair*) this)->second; } /// getLength - Efficiently return the length of this identifier info. /// unsigned getLength() const { if (Entry) return Entry->getKeyLength(); + // FIXME: This is gross. It would be best not to embed specific details + // of the PTH file format here. // The 'this' pointer really points to a // std::pair, where internal pointer // points to the external string data. - const char* p = ((std::pair*) this)->second; - return ((unsigned) p[0]) - | (((unsigned) p[1]) << 8) - | (((unsigned) p[2]) << 16) - | (((unsigned) p[3]) << 24); + const char* p = ((std::pair*) this)->second-2; + return (((unsigned) p[0]) + | (((unsigned) p[1]) << 8)) - 1; } /// hasMacroDefinition - Return true if this identifier is #defined to some diff --git a/include/clang/Lex/PTHManager.h b/include/clang/Lex/PTHManager.h index c9554939f5..83e2a9806b 100644 --- a/include/clang/Lex/PTHManager.h +++ b/include/clang/Lex/PTHManager.h @@ -53,9 +53,9 @@ class PTHManager : public IdentifierInfoLookup { /// reconsitute an IdentifierInfo. const unsigned char* const IdDataTable; - /// SortedIdTable - Array ordering persistent identifier IDs by the lexical - /// order of their corresponding strings. This is used by get(). - const unsigned char* const SortedIdTable; + /// SortedIdTable - Abstract data structure mapping from strings to + /// persistent IDs. This is used by get(). + void* StringIdLookup; /// NumIds - The number of identifiers in the PTH file. const unsigned NumIds; @@ -72,7 +72,7 @@ class PTHManager : public IdentifierInfoLookup { /// method. PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup, const unsigned char* idDataTable, IdentifierInfo** perIDCache, - const unsigned char* sortedIdTable, unsigned numIds, + void* stringIdLookup, unsigned numIds, const unsigned char* spellingBase); // Do not implement. @@ -83,7 +83,6 @@ class PTHManager : public IdentifierInfoLookup { /// spelling for a token. unsigned getSpellingAtPTHOffset(unsigned PTHOffset, const char*& Buffer); - /// GetIdentifierInfo - Used to reconstruct IdentifierInfo objects from the /// PTH file. inline IdentifierInfo* GetIdentifierInfo(unsigned PersistentID) { @@ -96,7 +95,7 @@ class PTHManager : public IdentifierInfoLookup { public: // The current PTH version. - enum { Version = 3 }; + enum { Version = 5 }; ~PTHManager(); diff --git a/lib/Lex/PTHLexer.cpp b/lib/Lex/PTHLexer.cpp index 4231f2a915..18f2dc4d8e 100644 --- a/lib/Lex/PTHLexer.cpp +++ b/lib/Lex/PTHLexer.cpp @@ -59,6 +59,21 @@ static inline uint32_t ReadLE32(const unsigned char *&Data) { return V; } +// Bernstein hash function: +// This is basically copy-and-paste from StringMap. This likely won't +// stay here, which is why I didn't both to expose this function from +// String Map. +static unsigned BernsteinHash(const char* x) { + unsigned int R = 0; + for ( ; *x != '\0' ; ++x) R = R * 33 + *x; + return R + (R >> 5); +} + +static unsigned BernsteinHash(const char* x, unsigned n) { + unsigned int R = 0; + for (unsigned i = 0 ; i < n ; ++i, ++x) R = R * 33 + *x; + return R + (R >> 5); +} //===----------------------------------------------------------------------===// // PTHLexer methods. @@ -447,10 +462,7 @@ public: } static unsigned ComputeHash(const char* x) { - // More copy-paste nonsense. Will refactor. - unsigned int R = 0; - for (; *x != '\0' ; ++x) R = R * 33 + *x; - return R + (R >> 5); + return BernsteinHash(x); } static const char* GetInternalKey(const FileEntry* FE) { @@ -472,9 +484,51 @@ public: return PTHFileData(x, y); } }; + +class VISIBILITY_HIDDEN PTHStringLookupTrait { +public: + typedef uint32_t + data_type; + + typedef const std::pair + external_key_type; + + typedef external_key_type internal_key_type; + + static bool EqualKey(const internal_key_type& a, + const internal_key_type& b) { + return (a.second == b.second) ? memcmp(a.first, b.first, a.second) == 0 + : false; + } + + static unsigned ComputeHash(const internal_key_type& a) { + return BernsteinHash(a.first, a.second); + } + + // This hopefully will just get inlined and removed by the optimizer. + static const internal_key_type& + GetInternalKey(const external_key_type& x) { return x; } + + static std::pair + ReadKeyDataLength(const unsigned char*& d) { + return std::make_pair((unsigned) ReadUnalignedLE16(d), sizeof(uint32_t)); + } + + static std::pair + ReadKey(const unsigned char* d, unsigned n) { + assert(n >= 2 && d[n-1] == '\0'); + return std::make_pair((const char*) d, n-1); + } + + static uint32_t ReadData(const unsigned char* d, unsigned) { + return ::ReadUnalignedLE32(d); + } +}; + } // end anonymous namespace -typedef OnDiskChainedHashTable PTHFileLookup; +typedef OnDiskChainedHashTable PTHFileLookup; +typedef OnDiskChainedHashTable PTHStringIdLookup; //===----------------------------------------------------------------------===// // PTHManager methods. @@ -483,15 +537,16 @@ typedef OnDiskChainedHashTable PTHFileLookup; PTHManager::PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup, const unsigned char* idDataTable, IdentifierInfo** perIDCache, - const unsigned char* sortedIdTable, unsigned numIds, + void* stringIdLookup, unsigned numIds, const unsigned char* spellingBase) : Buf(buf), PerIDCache(perIDCache), FileLookup(fileLookup), - IdDataTable(idDataTable), SortedIdTable(sortedIdTable), + IdDataTable(idDataTable), StringIdLookup(stringIdLookup), NumIds(numIds), PP(0), SpellingBase(spellingBase) {} PTHManager::~PTHManager() { delete Buf; delete (PTHFileLookup*) FileLookup; + delete (PTHStringIdLookup*) StringIdLookup; free(PerIDCache); } @@ -513,7 +568,7 @@ PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) { "PTH file %0 could not be read"); Diags->Report(FullSourceLoc(), DiagID) << file; } - + return 0; } @@ -575,13 +630,21 @@ PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) { return 0; } - // Get the location of the lexigraphically-sorted table of persistent IDs. - const unsigned char* SortedIdTableOffset = EndTable + sizeof(uint32_t)*1; - const unsigned char* SortedIdTable = BufBeg + ReadLE32(SortedIdTableOffset); - if (!(SortedIdTable >= BufBeg && SortedIdTable < BufEnd)) { + // Get the location of the hashtable mapping between strings and + // persistent IDs. + const unsigned char* StringIdTableOffset = EndTable + sizeof(uint32_t)*1; + const unsigned char* StringIdTable = BufBeg + ReadLE32(StringIdTableOffset); + if (!(StringIdTable >= BufBeg && StringIdTable < BufEnd)) { InvalidPTH(Diags); return 0; } + + llvm::OwningPtr SL(PTHStringIdLookup::Create(StringIdTable, + BufBeg)); + if (SL->isEmpty()) { + InvalidPTH(Diags, "PTH file contains no identifiers."); + return 0; + } // Get the location of the spelling cache. const unsigned char* spellingBaseOffset = EndTable + sizeof(uint32_t)*3; @@ -609,7 +672,7 @@ PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) { // Create the new PTHManager. return new PTHManager(File.take(), FL.take(), IData, PerIDCache, - SortedIdTable, NumIds, spellingBase); + SL.take(), NumIds, spellingBase); } IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) { // Look in the PTH file for the string data for the IdentifierInfo object. @@ -623,56 +686,28 @@ IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) { Alloc.Allocate >(); Mem->second = IDData; + assert(IDData[0] != '\0'); IdentifierInfo *II = new ((void*) Mem) IdentifierInfo(); // Store the new IdentifierInfo in the cache. PerIDCache[PersistentID] = II; + assert(II->getName() && II->getName()[0] != '\0'); return II; } IdentifierInfo* PTHManager::get(const char *NameStart, const char *NameEnd) { - unsigned min = 0; - unsigned max = NumIds; - unsigned Len = NameEnd - NameStart; - - do { - unsigned i = (max - min) / 2 + min; - const unsigned char *Ptr = SortedIdTable + (i * 4); - - // Read the persistentID. - unsigned perID = ReadLE32(Ptr); - - // Get the IdentifierInfo. - IdentifierInfo* II = GetIdentifierInfo(perID); - - // First compare the lengths. - unsigned IILen = II->getLength(); - if (Len < IILen) goto IsLess; - if (Len > IILen) goto IsGreater; - - // Now compare the strings! - { - signed comp = strncmp(NameStart, II->getName(), Len); - if (comp < 0) goto IsLess; - if (comp > 0) goto IsGreater; - } - // We found a match! - return II; - - IsGreater: - if (i == min) break; - min = i; - continue; - - IsLess: - max = i; - assert(!(max == min) || (min == i)); - } - while (min != max); - - return 0; -} + PTHStringIdLookup& SL = *((PTHStringIdLookup*)StringIdLookup); + // Double check our assumption that the last character isn't '\0'. + assert(NameStart[NameEnd-NameStart-1] != '\0'); + PTHStringIdLookup::iterator I = SL.find(std::make_pair(NameStart, + NameEnd - NameStart)); + if (I == SL.end()) // No identifier found? + return 0; + // Match found. Return the identifier! + assert(*I > 0); + return GetIdentifierInfo(*I-1); +} PTHLexer *PTHManager::CreateLexer(FileID FID) { const FileEntry *FE = PP->getSourceManager().getFileEntryForID(FID);