PTH: Replace string identifier to persistent ID lookup with a hashtable. This is

actually *slightly* slower than the binary search. Since this is algorithmically
better, further performance tuning should be able to make this faster.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@64326 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Ted Kremenek 2009-02-11 21:29:16 +00:00
Родитель 5b5c9ef865
Коммит 7e3a004c6e
4 изменённых файлов: 206 добавлений и 151 удалений

Просмотреть файл

@ -53,6 +53,16 @@ static void Pad(llvm::raw_fd_ostream& Out, unsigned A) {
for ( ; n ; --n ) Emit8(Out, 0);
}
// Bernstein hash function:
// This is basically copy-and-paste from StringMap. This likely won't
// stay here, which is why I didn't both to expose this function from
// String Map.
static unsigned BernsteinHash(const char* x) {
unsigned int R = 0;
for ( ; *x != '\0' ; ++x) R = R * 33 + *x;
return R + (R >> 5);
}
//===----------------------------------------------------------------------===//
// On Disk Hashtable Logic. This will eventually get refactored and put
// elsewhere.
@ -193,14 +203,7 @@ public:
typedef const PCHEntry& data_type_ref;
static unsigned ComputeHash(const FileEntry* FE) {
// Bernstein hash function:
// This is basically copy-and-paste from StringMap. This likely won't
// stay here, which is why I didn't both to expose this function from
// String Map. There are plenty of other hash functions which are likely
// to perform better and be faster.
unsigned int R = 0;
for (const char* x = FE->getName(); *x != '\0' ; ++x) R = R * 33 + *x;
return R + (R >> 5);
return BernsteinHash(FE->getName());
}
static std::pair<unsigned,unsigned>
@ -273,6 +276,10 @@ class VISIBILITY_HIDDEN PTHWriter {
for ( ; I != E ; ++I) Out << *I;
}
/// EmitIdentifierTable - Emits two tables to the PTH file. The first is
/// a hashtable mapping from identifier strings to persistent IDs.
/// The second is a straight table mapping from persistent IDs to string data
/// (the keys of the first table).
std::pair<Offset, Offset> EmitIdentifierTable();
/// EmitFileTable - Emit a table mapping from file name strings to PTH
@ -335,83 +342,6 @@ void PTHWriter::EmitToken(const Token& T) {
Emit32(PP.getSourceManager().getFileOffset(T.getLocation()));
}
namespace {
struct VISIBILITY_HIDDEN IDData {
const IdentifierInfo* II;
uint32_t FileOffset;
};
class VISIBILITY_HIDDEN CompareIDDataIndex {
IDData* Table;
public:
CompareIDDataIndex(IDData* table) : Table(table) {}
bool operator()(unsigned i, unsigned j) const {
const IdentifierInfo* II_i = Table[i].II;
const IdentifierInfo* II_j = Table[j].II;
unsigned i_len = II_i->getLength();
unsigned j_len = II_j->getLength();
if (i_len > j_len)
return false;
if (i_len < j_len)
return true;
// Otherwise, compare the strings themselves!
return strncmp(II_i->getName(), II_j->getName(), i_len) < 0;
}
};
}
std::pair<Offset,Offset> PTHWriter::EmitIdentifierTable() {
llvm::BumpPtrAllocator Alloc;
// Build an inverse map from persistent IDs -> IdentifierInfo*.
IDData* IIDMap = Alloc.Allocate<IDData>(idcount);
// Generate mapping from persistent IDs -> IdentifierInfo*.
for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) {
// Decrement by 1 because we are using a vector for the lookup and
// 0 is reserved for NULL.
assert(I->second > 0);
assert(I->second-1 < idcount);
unsigned idx = I->second-1;
IIDMap[idx].II = I->first;
}
// We want to write out the strings in lexical order to support binary
// search of strings to identifiers. Create such a table.
unsigned *LexicalOrder = Alloc.Allocate<unsigned>(idcount);
for (unsigned i = 0; i < idcount ; ++i ) LexicalOrder[i] = i;
std::sort(LexicalOrder, LexicalOrder+idcount, CompareIDDataIndex(IIDMap));
// Write out the lexically-sorted table of persistent ids.
Offset LexicalOff = Out.tell();
for (unsigned i = 0; i < idcount ; ++i) Emit32(LexicalOrder[i]);
for (unsigned i = 0; i < idcount; ++i) {
IDData& d = IIDMap[i];
d.FileOffset = Out.tell(); // Record the location for this data.
unsigned len = d.II->getLength(); // Write out the string length.
Emit32(len);
const char* buf = d.II->getName(); // Write out the string data.
EmitBuf(buf, buf+len);
// Emit a null character for those clients expecting that IdentifierInfo
// strings are null terminated.
Emit8('\0');
}
// Now emit the table mapping from persistent IDs to PTH file offsets.
Offset IDOff = Out.tell();
Emit32(idcount); // Emit the number of identifiers.
for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset);
return std::make_pair(IDOff, LexicalOff);
}
PCHEntry PTHWriter::LexTokens(Lexer& L) {
// Pad 0's so that we emit tokens to a 4-byte alignment.
// This speed up reading them back in.
@ -657,3 +587,92 @@ void clang::CacheTokens(Preprocessor& PP, const std::string& OutFile) {
PW.GeneratePTH();
}
//===----------------------------------------------------------------------===//
namespace {
class VISIBILITY_HIDDEN PCHIdKey {
public:
const IdentifierInfo* II;
uint32_t FileOffset;
};
class VISIBILITY_HIDDEN PCHIdentifierTableTrait {
public:
typedef PCHIdKey* key_type;
typedef key_type key_type_ref;
typedef uint32_t data_type;
typedef data_type data_type_ref;
static unsigned ComputeHash(PCHIdKey* key) {
return BernsteinHash(key->II->getName());
}
static std::pair<unsigned,unsigned>
EmitKeyDataLength(llvm::raw_ostream& Out, const PCHIdKey* key, uint32_t) {
unsigned n = strlen(key->II->getName()) + 1;
::Emit16(Out, n);
return std::make_pair(n, sizeof(uint32_t));
}
static void EmitKey(llvm::raw_fd_ostream& Out, PCHIdKey* key, unsigned n) {
// Record the location of the key data. This is used when generating
// the mapping from persistent IDs to strings.
key->FileOffset = Out.tell();
Out.write(key->II->getName(), n);
}
static void EmitData(llvm::raw_ostream& Out, uint32_t pID, unsigned) {
::Emit32(Out, pID);
}
};
} // end anonymous namespace
/// EmitIdentifierTable - Emits two tables to the PTH file. The first is
/// a hashtable mapping from identifier strings to persistent IDs. The second
/// is a straight table mapping from persistent IDs to string data (the
/// keys of the first table).
///
std::pair<Offset,Offset> PTHWriter::EmitIdentifierTable() {
// Build two maps:
// (1) an inverse map from persistent IDs -> (IdentifierInfo*,Offset)
// (2) a map from (IdentifierInfo*, Offset)* -> persistent IDs
// Note that we use 'calloc', so all the bytes are 0.
PCHIdKey* IIDMap = (PCHIdKey*) calloc(idcount, sizeof(PCHIdKey));
// Create the hashtable.
OnDiskChainedHashTableGenerator<PCHIdentifierTableTrait> IIOffMap;
// Generate mapping from persistent IDs -> IdentifierInfo*.
for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) {
// Decrement by 1 because we are using a vector for the lookup and
// 0 is reserved for NULL.
assert(I->second > 0);
assert(I->second-1 < idcount);
unsigned idx = I->second-1;
// Store the mapping from persistent ID to IdentifierInfo*
IIDMap[idx].II = I->first;
// Store the reverse mapping in a hashtable.
IIOffMap.insert(&IIDMap[idx], I->second);
}
// Write out the inverse map first. This causes the PCIDKey entries to
// record PTH file offsets for the string data. This is used to write
// the second table.
Offset StringTableOffset = IIOffMap.Emit(Out);
// Now emit the table mapping from persistent IDs to PTH file offsets.
Offset IDOff = Out.tell();
Emit32(idcount); // Emit the number of identifiers.
for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset);
// Finally, release the inverse map.
free(IIDMap);
return std::make_pair(IDOff, StringTableOffset);
}

Просмотреть файл

@ -83,24 +83,26 @@ public:
///
const char *getName() const {
if (Entry) return Entry->getKeyData();
// FIXME: This is gross. It would be best not to embed specific details
// of the PTH file format here.
// The 'this' pointer really points to a
// std::pair<IdentifierInfo, const char*>, where internal pointer
// points to the external string data.
return ((std::pair<IdentifierInfo, const char*>*) this)->second + 4;
return ((std::pair<IdentifierInfo, const char*>*) this)->second;
}
/// getLength - Efficiently return the length of this identifier info.
///
unsigned getLength() const {
if (Entry) return Entry->getKeyLength();
// FIXME: This is gross. It would be best not to embed specific details
// of the PTH file format here.
// The 'this' pointer really points to a
// std::pair<IdentifierInfo, const char*>, where internal pointer
// points to the external string data.
const char* p = ((std::pair<IdentifierInfo, const char*>*) this)->second;
return ((unsigned) p[0])
| (((unsigned) p[1]) << 8)
| (((unsigned) p[2]) << 16)
| (((unsigned) p[3]) << 24);
const char* p = ((std::pair<IdentifierInfo, const char*>*) this)->second-2;
return (((unsigned) p[0])
| (((unsigned) p[1]) << 8)) - 1;
}
/// hasMacroDefinition - Return true if this identifier is #defined to some

Просмотреть файл

@ -53,9 +53,9 @@ class PTHManager : public IdentifierInfoLookup {
/// reconsitute an IdentifierInfo.
const unsigned char* const IdDataTable;
/// SortedIdTable - Array ordering persistent identifier IDs by the lexical
/// order of their corresponding strings. This is used by get().
const unsigned char* const SortedIdTable;
/// SortedIdTable - Abstract data structure mapping from strings to
/// persistent IDs. This is used by get().
void* StringIdLookup;
/// NumIds - The number of identifiers in the PTH file.
const unsigned NumIds;
@ -72,7 +72,7 @@ class PTHManager : public IdentifierInfoLookup {
/// method.
PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup,
const unsigned char* idDataTable, IdentifierInfo** perIDCache,
const unsigned char* sortedIdTable, unsigned numIds,
void* stringIdLookup, unsigned numIds,
const unsigned char* spellingBase);
// Do not implement.
@ -83,7 +83,6 @@ class PTHManager : public IdentifierInfoLookup {
/// spelling for a token.
unsigned getSpellingAtPTHOffset(unsigned PTHOffset, const char*& Buffer);
/// GetIdentifierInfo - Used to reconstruct IdentifierInfo objects from the
/// PTH file.
inline IdentifierInfo* GetIdentifierInfo(unsigned PersistentID) {
@ -96,7 +95,7 @@ class PTHManager : public IdentifierInfoLookup {
public:
// The current PTH version.
enum { Version = 3 };
enum { Version = 5 };
~PTHManager();

Просмотреть файл

@ -59,6 +59,21 @@ static inline uint32_t ReadLE32(const unsigned char *&Data) {
return V;
}
// Bernstein hash function:
// This is basically copy-and-paste from StringMap. This likely won't
// stay here, which is why I didn't both to expose this function from
// String Map.
static unsigned BernsteinHash(const char* x) {
unsigned int R = 0;
for ( ; *x != '\0' ; ++x) R = R * 33 + *x;
return R + (R >> 5);
}
static unsigned BernsteinHash(const char* x, unsigned n) {
unsigned int R = 0;
for (unsigned i = 0 ; i < n ; ++i, ++x) R = R * 33 + *x;
return R + (R >> 5);
}
//===----------------------------------------------------------------------===//
// PTHLexer methods.
@ -447,10 +462,7 @@ public:
}
static unsigned ComputeHash(const char* x) {
// More copy-paste nonsense. Will refactor.
unsigned int R = 0;
for (; *x != '\0' ; ++x) R = R * 33 + *x;
return R + (R >> 5);
return BernsteinHash(x);
}
static const char* GetInternalKey(const FileEntry* FE) {
@ -472,9 +484,51 @@ public:
return PTHFileData(x, y);
}
};
class VISIBILITY_HIDDEN PTHStringLookupTrait {
public:
typedef uint32_t
data_type;
typedef const std::pair<const char*, unsigned>
external_key_type;
typedef external_key_type internal_key_type;
static bool EqualKey(const internal_key_type& a,
const internal_key_type& b) {
return (a.second == b.second) ? memcmp(a.first, b.first, a.second) == 0
: false;
}
static unsigned ComputeHash(const internal_key_type& a) {
return BernsteinHash(a.first, a.second);
}
// This hopefully will just get inlined and removed by the optimizer.
static const internal_key_type&
GetInternalKey(const external_key_type& x) { return x; }
static std::pair<unsigned, unsigned>
ReadKeyDataLength(const unsigned char*& d) {
return std::make_pair((unsigned) ReadUnalignedLE16(d), sizeof(uint32_t));
}
static std::pair<const char*, unsigned>
ReadKey(const unsigned char* d, unsigned n) {
assert(n >= 2 && d[n-1] == '\0');
return std::make_pair((const char*) d, n-1);
}
static uint32_t ReadData(const unsigned char* d, unsigned) {
return ::ReadUnalignedLE32(d);
}
};
} // end anonymous namespace
typedef OnDiskChainedHashTable<PTHFileLookupTrait> PTHFileLookup;
typedef OnDiskChainedHashTable<PTHStringLookupTrait> PTHStringIdLookup;
//===----------------------------------------------------------------------===//
// PTHManager methods.
@ -483,15 +537,16 @@ typedef OnDiskChainedHashTable<PTHFileLookupTrait> PTHFileLookup;
PTHManager::PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup,
const unsigned char* idDataTable,
IdentifierInfo** perIDCache,
const unsigned char* sortedIdTable, unsigned numIds,
void* stringIdLookup, unsigned numIds,
const unsigned char* spellingBase)
: Buf(buf), PerIDCache(perIDCache), FileLookup(fileLookup),
IdDataTable(idDataTable), SortedIdTable(sortedIdTable),
IdDataTable(idDataTable), StringIdLookup(stringIdLookup),
NumIds(numIds), PP(0), SpellingBase(spellingBase) {}
PTHManager::~PTHManager() {
delete Buf;
delete (PTHFileLookup*) FileLookup;
delete (PTHStringIdLookup*) StringIdLookup;
free(PerIDCache);
}
@ -575,14 +630,22 @@ PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) {
return 0;
}
// Get the location of the lexigraphically-sorted table of persistent IDs.
const unsigned char* SortedIdTableOffset = EndTable + sizeof(uint32_t)*1;
const unsigned char* SortedIdTable = BufBeg + ReadLE32(SortedIdTableOffset);
if (!(SortedIdTable >= BufBeg && SortedIdTable < BufEnd)) {
// Get the location of the hashtable mapping between strings and
// persistent IDs.
const unsigned char* StringIdTableOffset = EndTable + sizeof(uint32_t)*1;
const unsigned char* StringIdTable = BufBeg + ReadLE32(StringIdTableOffset);
if (!(StringIdTable >= BufBeg && StringIdTable < BufEnd)) {
InvalidPTH(Diags);
return 0;
}
llvm::OwningPtr<PTHStringIdLookup> SL(PTHStringIdLookup::Create(StringIdTable,
BufBeg));
if (SL->isEmpty()) {
InvalidPTH(Diags, "PTH file contains no identifiers.");
return 0;
}
// Get the location of the spelling cache.
const unsigned char* spellingBaseOffset = EndTable + sizeof(uint32_t)*3;
const unsigned char* spellingBase = BufBeg + ReadLE32(spellingBaseOffset);
@ -609,7 +672,7 @@ PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) {
// Create the new PTHManager.
return new PTHManager(File.take(), FL.take(), IData, PerIDCache,
SortedIdTable, NumIds, spellingBase);
SL.take(), NumIds, spellingBase);
}
IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) {
// Look in the PTH file for the string data for the IdentifierInfo object.
@ -623,56 +686,28 @@ IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) {
Alloc.Allocate<std::pair<IdentifierInfo,const unsigned char*> >();
Mem->second = IDData;
assert(IDData[0] != '\0');
IdentifierInfo *II = new ((void*) Mem) IdentifierInfo();
// Store the new IdentifierInfo in the cache.
PerIDCache[PersistentID] = II;
assert(II->getName() && II->getName()[0] != '\0');
return II;
}
IdentifierInfo* PTHManager::get(const char *NameStart, const char *NameEnd) {
unsigned min = 0;
unsigned max = NumIds;
unsigned Len = NameEnd - NameStart;
do {
unsigned i = (max - min) / 2 + min;
const unsigned char *Ptr = SortedIdTable + (i * 4);
// Read the persistentID.
unsigned perID = ReadLE32(Ptr);
// Get the IdentifierInfo.
IdentifierInfo* II = GetIdentifierInfo(perID);
// First compare the lengths.
unsigned IILen = II->getLength();
if (Len < IILen) goto IsLess;
if (Len > IILen) goto IsGreater;
// Now compare the strings!
{
signed comp = strncmp(NameStart, II->getName(), Len);
if (comp < 0) goto IsLess;
if (comp > 0) goto IsGreater;
}
// We found a match!
return II;
IsGreater:
if (i == min) break;
min = i;
continue;
IsLess:
max = i;
assert(!(max == min) || (min == i));
}
while (min != max);
PTHStringIdLookup& SL = *((PTHStringIdLookup*)StringIdLookup);
// Double check our assumption that the last character isn't '\0'.
assert(NameStart[NameEnd-NameStart-1] != '\0');
PTHStringIdLookup::iterator I = SL.find(std::make_pair(NameStart,
NameEnd - NameStart));
if (I == SL.end()) // No identifier found?
return 0;
}
// Match found. Return the identifier!
assert(*I > 0);
return GetIdentifierInfo(*I-1);
}
PTHLexer *PTHManager::CreateLexer(FileID FID) {
const FileEntry *FE = PP->getSourceManager().getFileEntryForID(FID);