From 57ad216048b4d66997ca8f04f8be4c6d90c4188d Mon Sep 17 00:00:00 2001
From: Boris Zbarsky
Date: Wed, 29 Feb 2012 21:40:47 -0500
Subject: [PATCH] Bug 730100. Add a Bloom filter implementation. r=waldo

---
 mfbt/BloomFilter.h              | 232 ++++++++++++++++++++++++++++++++
 mfbt/Likely.h                   |  17 +++
 mfbt/exported_headers.mk        |   2 +
 xpcom/tests/Makefile.in         |   1 +
 xpcom/tests/TestBloomFilter.cpp | 120 +++++++++++++++++
 5 files changed, 372 insertions(+)
 create mode 100644 mfbt/BloomFilter.h
 create mode 100644 mfbt/Likely.h
 create mode 100644 xpcom/tests/TestBloomFilter.cpp

diff --git a/mfbt/BloomFilter.h b/mfbt/BloomFilter.h
new file mode 100644
index 00000000000..a28c69bbe1c
--- /dev/null
+++ b/mfbt/BloomFilter.h
@@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * A counting Bloom filter implementation. This allows consumers to
+ * do fast probabilistic "is item X in set Y?" testing which will
+ * never answer "no" when the correct answer is "yes" (but might
+ * incorrectly answer "yes" when the correct answer is "no").
+ */
+
+#ifndef mozilla_BloomFilter_h_
+#define mozilla_BloomFilter_h_
+
+#include "mozilla/Likely.h"
+#include "mozilla/StdInt.h"
+#include "mozilla/Util.h"
+
+#include <string.h>
+
+namespace mozilla {
+
+/*
+ * This class implements a counting Bloom filter as described at
+ * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
+ * 8-bit counters. This allows quick probabilistic answers to the
+ * question "is object X in set Y?" where the contents of Y might not
+ * be time-invariant. The probabilistic nature of the test means that
+ * sometimes the answer will be "yes" when it should be "no". If the
+ * answer is "no", then X is guaranteed not to be in Y.
+ *
+ * The filter is parametrized on KeySize, which is the size of the key
+ * generated by each of the hash functions used by the filter, in bits,
+ * and the type of object T being added and removed. T must implement
+ * a |uint32_t hash() const| method which returns a uint32_t hash key
+ * that will be used to generate the two separate hash functions for
+ * the Bloom filter. This hash key MUST be well-distributed for good
+ * results! KeySize is not allowed to be larger than 16.
+ *
+ * The filter uses exactly 2**KeySize bytes of memory. From now on we
+ * will refer to the memory used by the filter as M.
+ *
+ * The expected rate of incorrect "yes" answers depends on M and on
+ * the number N of objects in set Y. As long as N is small compared
+ * to M, the rate of such answers is expected to be approximately
+ * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
+ * elements then using a KeySize of 12 gives a reasonably low
+ * incorrect answer rate. A KeySize of 12 has the additional benefit
+ * of using exactly one page for the filter in typical hardware
+ * configurations.
+ */
+
+template<unsigned KeySize, class T>
+class BloomFilter {
+    /*
+     * A counting Bloom filter with 8-bit counters. For now we assume
+     * that having two hash functions is enough, but we may revisit that
+     * decision later.
+     *
+     * The filter uses an array with 2**KeySize entries.
+     *
+     * Assuming a well-distributed hash function, a Bloom filter with
+     * array size M containing N elements and
+     * using k hash functions has expected false positive rate exactly
+     *
+     * $ (1 - (1 - 1/M)^{kN})^k $
+     *
+     * because each array slot has a
+     *
+     * $ (1 - 1/M)^{kN} $
+     *
+     * chance of being 0, and the expected false positive rate is the
+     * probability that all of the k hash functions will hit a nonzero
+     * slot.
+     *
+     * For reasonable assumptions (M large, kN large, which should both
+     * hold if we're worried about false positives) about M and kN this
+     * becomes approximately
+     *
+     * $$ (1 - \exp(-kN/M))^k $$
+     *
+     * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
+     * or in other words
+     *
+     * $$ N/M = -0.5 * \ln(1 - \sqrt{r}) $$
+     *
+     * where r is the false positive rate. This can be used to compute
+     * the desired KeySize for a given load N and false positive rate r.
+     *
+     * If N/M is assumed small, then the false positive rate can
+     * further be approximated as 4*N^2/M^2. So increasing KeySize by
+     * 1, which doubles M, reduces the false positive rate by about a
+     * factor of 4, and a false positive rate of 1% corresponds to
+     * about M/N == 20.
+     *
+     * What this means in practice is that for a few hundred keys using a
+     * KeySize of 12 gives false positive rates on the order of 0.25-4%.
+     *
+     * Similarly, using a KeySize of 10 would lead to a 4% false
+     * positive rate for N == 100 and to quite bad false positive
+     * rates for larger N.
+     */
+public:
+    BloomFilter() {
+        MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
+
+        // Should we have a custom operator new using calloc instead and
+        // require that we're allocated via the operator?
+        clear();
+    }
+
+    /*
+     * Clear the filter. This should be done before reusing it, because
+     * just removing all items doesn't clear counters that hit the upper
+     * bound.
+     */
+    void clear();
+
+    /*
+     * Add an item to the filter.
+     */
+    void add(const T* t);
+
+    /*
+     * Remove an item from the filter. Only remove items that were added.
+     */
+    void remove(const T* t);
+
+    /*
+     * Check whether the filter might contain an item. This can
+     * sometimes return true even if the item is not in the filter,
+     * but will never return false for items that are actually in the
+     * filter.
+     */
+    bool mayContain(const T* t) const;
+
+    /*
+     * Methods for add/remove/contain when we already have a hash computed
+     */
+    void add(uint32_t hash);
+    void remove(uint32_t hash);
+    bool mayContain(uint32_t hash) const;
+
+private:
+    static const size_t arraySize = (1 << KeySize);
+    static const uint32_t keyMask = (1 << KeySize) - 1;
+    static const uint32_t keyShift = 16; // hash2 reads bits 16 and up
+
+    static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
+    static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
+
+    uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
+    uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
+    const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
+    const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
+
+    static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
+
+    uint8_t counters[arraySize];
+};
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::clear()
+{
+    memset(counters, 0, arraySize);
+}
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::add(uint32_t hash)
+{
+    uint8_t& slot1 = firstSlot(hash);
+    if (MOZ_LIKELY(!full(slot1)))
+        ++slot1;
+
+    uint8_t& slot2 = secondSlot(hash);
+    if (MOZ_LIKELY(!full(slot2)))
+        ++slot2;
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE void
+BloomFilter<KeySize, T>::add(const T* t)
+{
+    uint32_t hash = t->hash();
+    return add(hash);
+}
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::remove(uint32_t hash)
+{
+    // If the slots are full, we don't know whether we bumped them to be
+    // there when we added or not, so just leave them full.
+    uint8_t& slot1 = firstSlot(hash);
+    if (MOZ_LIKELY(!full(slot1)))
+        --slot1;
+
+    uint8_t& slot2 = secondSlot(hash);
+    if (MOZ_LIKELY(!full(slot2)))
+        --slot2;
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE void
+BloomFilter<KeySize, T>::remove(const T* t)
+{
+    uint32_t hash = t->hash();
+    remove(hash);
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE bool
+BloomFilter<KeySize, T>::mayContain(uint32_t hash) const
+{
+    // Check that all the slots for this hash contain something
+    return firstSlot(hash) && secondSlot(hash);
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE bool
+BloomFilter<KeySize, T>::mayContain(const T* t) const
+{
+    uint32_t hash = t->hash();
+    return mayContain(hash);
+}
+
+} // namespace mozilla
+
+#endif /* mozilla_BloomFilter_h_ */
diff --git a/mfbt/Likely.h b/mfbt/Likely.h
new file mode 100644
index 00000000000..bbc9a1d957c
--- /dev/null
+++ b/mfbt/Likely.h
@@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * MOZ_LIKELY and MOZ_UNLIKELY macros to hint to the compiler how a
+ * boolean predicate should be branch-predicted.
+ */
+/* !!(x) coerces any scalar predicate to 0/1, matching the expected value. */
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))
+#  define MOZ_LIKELY(x)   (__builtin_expect(!!(x), 1))
+#  define MOZ_UNLIKELY(x) (__builtin_expect(!!(x), 0))
+#else
+#  define MOZ_LIKELY(x)   (!!(x))
+#  define MOZ_UNLIKELY(x) (!!(x))
+#endif
+
diff --git a/mfbt/exported_headers.mk b/mfbt/exported_headers.mk
index 85313744d5f..6ca31f33e24 100644
--- a/mfbt/exported_headers.mk
+++ b/mfbt/exported_headers.mk
@@ -44,7 +44,9 @@ EXPORTS_NAMESPACES += mozilla
 EXPORTS_mozilla += \
   Assertions.h \
   Attributes.h \
+  BloomFilter.h \
   GuardObjects.h \
+  Likely.h \
   LinkedList.h \
   MSStdInt.h \
   RangedPtr.h \
diff --git a/xpcom/tests/Makefile.in b/xpcom/tests/Makefile.in
index 117bd526e25..6d30b0b45a9 100644
--- a/xpcom/tests/Makefile.in
+++ b/xpcom/tests/Makefile.in
@@ -91,6 +91,7 @@ CPP_UNIT_TESTS = \
   ShowAlignments.cpp \
   ShowSSEConfig.cpp \
   TestAutoPtr.cpp \
+  TestBloomFilter.cpp \
   TestCOMArray.cpp \
   TestCOMPtr.cpp \
   TestCOMPtrEq.cpp \
diff --git a/xpcom/tests/TestBloomFilter.cpp b/xpcom/tests/TestBloomFilter.cpp
new file mode 100644
index 00000000000..f94d5dc6650
--- /dev/null
+++ b/xpcom/tests/TestBloomFilter.cpp
@@ -0,0 +1,120 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/BloomFilter.h"
+#include "TestHarness.h"
+
+#include <stdio.h>
+
+using namespace mozilla;
+
+class FilterChecker {
+public:
+  explicit FilterChecker(uint32_t hash) :
+    mHash(hash)
+  {}
+
+  uint32_t hash() const { return mHash; }
+
+private:
+  uint32_t mHash;
+};
+
+int main()
+{
+  BloomFilter<12, FilterChecker> *filter = new BloomFilter<12, FilterChecker>();
+
+  FilterChecker one(1);             // slots 1 and 0
+  FilterChecker two(0x20000);       // slots 0 and 2
+  FilterChecker many(0x10000);      // slots 0 and 1: aliases 'one'
+  FilterChecker multiple(0x20001);  // slots 1 and 2: needs 'one' + 'two'
+
+  filter->add(&one);
+  if (!filter->mayContain(&one)) {
+    fail("Filter should contain 'one'");
+    return -1;
+  }
+
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not");
+    return -1;
+  }
+
+  if (!filter->mayContain(&many)) {
+    fail("Filter should contain 'many' (false positive)");
+    return -1;
+  }
+
+  filter->add(&two);
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' (false positive)");
+    return -1;
+  }
+
+  // Test basic removals
+  filter->remove(&two);
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not after two was "
+         "removed");
+    return -1;
+  }
+
+  // Test multiple addition/removal
+  const unsigned FILTER_SIZE = 255;  // saturation value of an 8-bit counter
+  for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
+    filter->add(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' after 'two' added lots of times "
+         "(false positive)");
+    return -1;
+  }
+  for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
+    filter->remove(&two);
+  }
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not after two was "
+         "removed lots of times");
+    return -1;
+  }
+
+  // Test overflowing the filter buckets
+  for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
+    filter->add(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' after 'two' added lots more times "
+         "(false positive)");
+    return -1;
+  }
+  for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
+    filter->remove(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter claims to not contain 'multiple' even though we should have "
+         "run out of space in the buckets (false positive)");
+    return -1;
+  }
+  if (!filter->mayContain(&two)) {
+    fail("Filter claims to not contain 'two' even though we should have run "
+         "out of space in the buckets (false positive)");
+    return -1;
+  }
+
+  filter->remove(&one);
+  if (filter->mayContain(&one)) {
+    fail("Filter should not contain 'one', because we didn't overflow its "
+         "bucket");
+    return -1;
+  }
+
+  filter->clear();
+  if (filter->mayContain(&multiple)) {
+    fail("clear() failed to work");
+    return -1;
+  }
+
+  return 0;
+}