зеркало из https://github.com/mozilla/gecko-dev.git
329 строки
13 KiB
Python
329 строки
13 KiB
Python
# perfecthash.py - Helper for generating perfect hash functions
|
|
#
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
"""
|
|
A perfect hash function (PHF) is a function which maps distinct elements from a
|
|
source set to a sequence of integers with no collisions.
|
|
|
|
PHFs generated by this module use a 32-bit FNV hash to index an intermediate
|
|
table. The value from that table is then used to run a second 32-bit FNV hash to
|
|
get a final index.
|
|
|
|
The algorithm works by starting with the largest set of conflicts, and testing
|
|
intermediate table values until no conflicts are generated, allowing efficient
|
|
index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
|
|
|
|
NOTE: This perfect hash generator was designed to be simple, easy to follow, and
|
|
maintainable, rather than being as optimized as possible, due to the relatively
|
|
small dataset it was designed for. In the future we may want to optimize further.
|
|
"""
|
|
|
|
from collections import namedtuple
|
|
import textwrap
|
|
|
|
|
|
class PerfectHash(object):
|
|
"""PerfectHash objects represent a computed perfect hash function, which
|
|
can be generated at compile time to provide highly efficient and compact
|
|
static HashTables.
|
|
|
|
Consumers must provide an intermediate table size to store the generated
|
|
hash function. Larger tables will generate more quickly, while smaller
|
|
tables will consume less space on disk."""
|
|
|
|
# 32-bit FNV offset basis and prime value.
|
|
# NOTE: Must match values in |PerfectHash.h|
|
|
FNV_OFFSET_BASIS = 0x811C9DC5
|
|
FNV_PRIME = 16777619
|
|
U32_MAX = 0xffffffff
|
|
|
|
# Bucket of entries which map into a given intermediate index.
|
|
Bucket = namedtuple('Bucket', 'index entries')
|
|
|
|
def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
|
|
"""Create a new PerfectHash
|
|
|
|
@param entries set of entries to generate a PHF for
|
|
@param size size of the PHF intermediate table
|
|
@param validate test the generated hash function after generation
|
|
@param key function to get 'memoryview'-compatible key for an
|
|
entry. defaults to extracting the first element of an
|
|
entry 2-tuple.
|
|
"""
|
|
|
|
assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
|
|
self._key = key
|
|
|
|
# Allocate the intermediate table and keys.
|
|
self.table = [0] * size
|
|
self.entries = [None] * len(entries)
|
|
|
|
# A set of Bucket values. Each bucket contains the index into the
|
|
# intermediate table, and a list of Entries which map into that bucket.
|
|
buckets = [self.Bucket(idx, []) for idx in range(size)]
|
|
|
|
# Determine which input strings map to which buckets in the table.
|
|
for entry in entries:
|
|
assert entry is not None, "None entry in entries"
|
|
assert self.key(entry).itemsize == 1, "non-byte key elements"
|
|
buckets[self._hash(self.key(entry)) % size].entries.append(entry)
|
|
|
|
# Sort the buckets such that the largest one comes first.
|
|
buckets.sort(key=lambda b: len(b.entries), reverse=True)
|
|
|
|
for bucket in buckets:
|
|
# Once we've reached an empty bucket, we're done.
|
|
if len(bucket.entries) == 0:
|
|
break
|
|
|
|
# Try values for the basis until we find one with no conflicts.
|
|
idx = 0
|
|
basis = 1
|
|
slots = []
|
|
while idx < len(bucket.entries):
|
|
slot = self._hash(self.key(bucket.entries[idx]), basis) % len(self.entries)
|
|
if self.entries[slot] is not None or slot in slots:
|
|
# There was a conflict, try the next basis.
|
|
basis += 1
|
|
idx = 0
|
|
del slots[:]
|
|
assert basis < self.U32_MAX, "table too small"
|
|
else:
|
|
slots.append(slot)
|
|
idx += 1
|
|
|
|
assert basis < self.U32_MAX
|
|
|
|
# We've found a basis which doesn't conflict
|
|
self.table[bucket.index] = basis
|
|
for slot, entry in zip(slots, bucket.entries):
|
|
self.entries[slot] = entry
|
|
|
|
# Validate that looking up each key succeeds
|
|
if validate:
|
|
for entry in entries:
|
|
assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
|
|
|
|
@classmethod
|
|
def _hash(cls, key, basis=FNV_OFFSET_BASIS):
|
|
"""A basic FNV-based hash function. key is the memoryview to hash.
|
|
32-bit FNV is used for indexing into the first table, and the value
|
|
stored in that table is used as the offset basis for indexing into the
|
|
values table."""
|
|
for byte in memoryview(key):
|
|
basis ^= ord(byte) # xor-in the byte
|
|
basis *= cls.FNV_PRIME # Multiply by the FNV prime
|
|
basis &= cls.U32_MAX # clamp to 32-bits
|
|
return basis
|
|
|
|
def key(self, entry):
|
|
return memoryview(self._key(entry))
|
|
|
|
def get_raw_index(self, key):
|
|
"""Determine the index in self.entries without validating"""
|
|
mid = self.table[self._hash(key) % len(self.table)]
|
|
return self._hash(key, mid) % len(self.entries)
|
|
|
|
def get_index(self, key):
|
|
"""Given a key, determine the index in self.entries"""
|
|
idx = self.get_raw_index(key)
|
|
if memoryview(key) != self.key(self.entries[idx]):
|
|
return None
|
|
return idx
|
|
|
|
def get_entry(self, key):
|
|
"""Given a key, get the corresponding entry"""
|
|
idx = self.get_index(key)
|
|
if idx is None:
|
|
return None
|
|
return self.entries[idx]
|
|
|
|
@staticmethod
|
|
def _indent(text, amount=1):
|
|
return text.replace('\n', '\n' + (' ' * amount))
|
|
|
|
def codegen(self, name, entry_type):
|
|
"""Create a helper for codegening PHF logic"""
|
|
return CGHelper(self, name, entry_type)
|
|
|
|
def cxx_codegen(self, name, entry_type, lower_entry, entries_name=None,
|
|
return_type=None, return_entry='return entry;',
|
|
key_type='const char*', key_bytes='aKey',
|
|
key_length='strlen(aKey)'):
|
|
"""Generate complete C++ code for a get_entry-style method.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param entry_type C++ type of each entry in static memory.
|
|
@param lower_entry Function called with each entry to convert it to a
|
|
C++ literal of type 'entry_type'.
|
|
|
|
# Optional Keyword Parameters:
|
|
@param entries_name Name for the generated entry table.
|
|
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table.
|
|
|
|
@param key_type C++ key type, default: 'const char*'
|
|
@param key_bytes 'const char*' expression to get bytes for 'aKey'
|
|
@param key_length 'size_t' expression to get length of 'aKey'"""
|
|
|
|
if entries_name is None:
|
|
entries_name = 's%sEntries' % name
|
|
|
|
cg = self.codegen(entries_name, entry_type)
|
|
entries = cg.gen_entries(lower_entry)
|
|
getter = cg.gen_getter(name, return_type, return_entry,
|
|
key_type, key_bytes, key_length)
|
|
|
|
return "%s\n\n%s" % (entries, getter)
|
|
|
|
|
|
class CGHelper(object):
|
|
"""Helper object for generating C++ code for a PerfectHash.
|
|
Created using PerfectHash.codegen()."""
|
|
|
|
def __init__(self, phf, entries_name, entry_type):
|
|
self.phf = phf
|
|
self.entries_name = entries_name
|
|
self.entry_type = entry_type
|
|
|
|
@staticmethod
|
|
def _indent(text, amount=1):
|
|
return text.replace('\n', '\n' + (' ' * amount))
|
|
|
|
def basis_ty(self):
|
|
"""Determine how big of an integer is needed for bases table."""
|
|
max_base = max(self.phf.table)
|
|
if max_base <= 0xff:
|
|
return 'uint8_t'
|
|
elif max_base <= 0xffff:
|
|
return 'uint16_t'
|
|
return 'uint32_t'
|
|
|
|
def basis_table(self, name='BASES'):
|
|
"""Generate code for a static basis table"""
|
|
bases = ''
|
|
for idx, base in enumerate(self.phf.table):
|
|
if idx and idx % 16 == 0: # 16 bases per line
|
|
bases += '\n '
|
|
bases += '%4d,' % base
|
|
return textwrap.dedent("""\
|
|
static const %s %s[] = {
|
|
%s
|
|
};
|
|
""") % (self.basis_ty(), name, bases)
|
|
|
|
def gen_entries(self, lower_entry):
|
|
"""Generate code for an entries table"""
|
|
entries = self._indent(',\n'.join(lower_entry(entry).rstrip()
|
|
for entry in self.phf.entries))
|
|
return textwrap.dedent("""\
|
|
const %s %s[] = {
|
|
%s
|
|
};
|
|
""") % (self.entry_type, self.entries_name, entries)
|
|
|
|
def gen_getter(self, name, return_type=None, return_entry='return entry;',
|
|
key_type='const char*', key_bytes='aKey',
|
|
key_length='strlen(aKey)'):
|
|
"""Generate the code for a C++ getter.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table.
|
|
|
|
@param key_type C++ key type, default: 'const char*'
|
|
@param key_bytes 'const char*' expression to get bytes for 'aKey'
|
|
@param key_length 'size_t' expression to get length of 'aKey'"""
|
|
|
|
if return_type is None:
|
|
return_type = 'const %s&' % self.entry_type
|
|
|
|
return textwrap.dedent("""
|
|
%(return_type)s
|
|
%(name)s(%(key_type)s aKey)
|
|
{
|
|
%(basis_table)s
|
|
|
|
const char* bytes = %(key_bytes)s;
|
|
size_t length = %(key_length)s;
|
|
auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
|
|
%(entries_name)s);
|
|
%(return_entry)s
|
|
}
|
|
""") % {
|
|
'name': name,
|
|
'basis_table': self._indent(self.basis_table()),
|
|
'entries_name': self.entries_name,
|
|
|
|
'return_type': return_type,
|
|
'return_entry': self._indent(return_entry),
|
|
|
|
'key_type': key_type,
|
|
'key_bytes': key_bytes,
|
|
'key_length': key_length,
|
|
}
|
|
|
|
def gen_jslinearstr_getter(self, name, return_type=None,
|
|
return_entry='return entry;'):
|
|
"""Generate code for a specialized getter taking JSLinearStrings.
|
|
This getter avoids copying the JS string, but only supports ASCII keys.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table."""
|
|
|
|
assert all(ord(b) <= 0x7f
|
|
for e in self.phf.entries
|
|
for b in self.phf.key(e)), "non-ASCII key"
|
|
|
|
if return_type is None:
|
|
return_type = 'const %s&' % self.entry_type
|
|
|
|
return textwrap.dedent("""
|
|
%(return_type)s
|
|
%(name)s(JSLinearString* aKey)
|
|
{
|
|
%(basis_table)s
|
|
|
|
size_t length = js::GetLinearStringLength(aKey);
|
|
|
|
JS::AutoCheckCannotGC nogc;
|
|
if (js::LinearStringHasLatin1Chars(aKey)) {
|
|
auto& entry = mozilla::perfecthash::Lookup(
|
|
js::GetLatin1LinearStringChars(nogc, aKey),
|
|
length, BASES, %(entries_name)s);
|
|
|
|
%(return_entry)s
|
|
} else {
|
|
auto& entry = mozilla::perfecthash::Lookup(
|
|
js::GetTwoByteLinearStringChars(nogc, aKey),
|
|
length, BASES, %(entries_name)s);
|
|
|
|
%(return_entry)s
|
|
}
|
|
}
|
|
""") % {
|
|
'name': name,
|
|
'basis_table': self._indent(self.basis_table()),
|
|
'entries_name': self.entries_name,
|
|
|
|
'return_type': return_type,
|
|
'return_entry': self._indent(return_entry, 2),
|
|
}
|