Bug 730100. Add a Bloom filter implementation. r=waldo

2012-02-29 21:40:47 -05:00 · 2012-02-29 21:40:47 -05:00 · 3c48581eda
--- a/mfbt/BloomFilter.h
+++ b/mfbt/BloomFilter.h
@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * A counting Bloom filter implementation.  This allows consumers to
+ * do fast probabilistic "is item X in set Y?" testing which will
+ * never answer "no" when the correct answer is "yes" (but might
+ * incorrectly answer "yes" when the correct answer is "no").
+ */
+
+#ifndef mozilla_BloomFilter_h_
+#define mozilla_BloomFilter_h_
+
+#include "mozilla/Likely.h"
+#include "mozilla/StdInt.h"
+#include "mozilla/Util.h"
+
+#include <string.h>
+
+namespace mozilla {
+
+/*
+ * This class implements a counting Bloom filter as described at
+ * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
+ * 8-bit counters.  This allows quick probabilistic answers to the
+ * question "is object X in set Y?" where the contents of Y might not
+ * be time-invariant.  The probabilistic nature of the test means that
+ * sometimes the answer will be "yes" when it should be "no".  If the
+ * answer is "no", then X is guaranteed not to be in Y.
+ *
+ * The filter is parametrized on KeySize, which is the size of the key
+ * generated by each of hash functions used by the filter, in bits,
+ * and the type of object T being added and removed.  T must implement
+ * a |uint32_t hash() const| method which returns a uint32_t hash key
+ * that will be used to generate the two separate hash functions for
+ * the Bloom filter.  This hash key MUST be well-distributed for good
+ * results!  KeySize is not allowed to be larger than 16.
+ *
+ * The filter uses exactly 2**KeySize bytes of memory.  From now on we
+ * will refer to the memory used by the filter as M.
+ *
+ * The expected rate of incorrect "yes" answers depends on M and on
+ * the number N of objects in set Y.  As long as N is small compared
+ * to M, the rate of such answers is expected to be approximately
+ * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
+ * elements then using a KeySize of 12 gives a reasonably low
+ * incorrect answer rate.  A KeySize of 12 has the additional benefit
+ * of using exactly one page for the filter in typical hardware
+ * configurations.
+ */
+
+template<unsigned KeySize, class T>
+class BloomFilter {
+    /*
+     * A counting Bloom filter with 8-bit counters.  For now we assume
+     * that having two hash functions is enough, but we may revisit that
+     * decision later.
+     *
+     * The filter uses an array with 2**KeySize entries.
+     *
+     * Assuming a well-distributed hash function, a Bloom filter with
+     * array size M containing N elements and
+     * using k hash function has expected false positive rate exactly
+     *
+     * $  (1 - (1 - 1/M)^{kN})^k  $
+     *
+     * because each array slot has a
+     *
+     * $  (1 - 1/M)^{kN}  $
+     *
+     * chance of being 0, and the expected false positive rate is the
+     * probability that all of the k hash functions will hit a nonzero
+     * slot.
+     *
+     * For reasonable assumptions (M large, kN large, which should both
+     * hold if we're worried about false positives) about M and kN this
+     * becomes approximately
+     *
+     * $$  (1 - \exp(-kN/M))^k   $$
+     *
+     * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
+     * or in other words
+     *
+     * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
+     *
+     * where r is the false positive rate.  This can be used to compute
+     * the desired KeySize for a given load N and false positive rate r.
+     *
+     * If N/M is assumed small, then the false positive rate can
+     * further be approximated as 4*N^2/M^2.  So increasing KeySize by
+     * 1, which doubles M, reduces the false positive rate by about a
+     * factor of 4, and a false positive rate of 1% corresponds to
+     * about M/N == 20.
+     *
+     * What this means in practice is that for a few hundred keys using a
+     * KeySize of 12 gives false positive rates on the order of 0.25-4%.
+     *
+     * Similarly, using a KeySize of 10 would lead to a 4% false
+     * positive rate for N == 100 and to quite bad false positive
+     * rates for larger N.
+     */
+public:
+    BloomFilter() {
+        MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
+
+        // Should we have a custom operator new using calloc instead and
+        // require that we're allocated via the operator?
+        clear();
+    }
+
+    /*
+     * Clear the filter.  This should be done before reusing it, because
+     * just removing all items doesn't clear counters that hit the upper
+     * bound.
+     */
+    void clear();
+
+    /*
+     * Add an item to the filter.
+     */
+    void add(const T* t);
+
+    /*
+     * Remove an item from the filter.
+     */
+    void remove(const T* t);
+
+    /*
+     * Check whether the filter might contain an item.  This can
+     * sometimes return true even if the item is not in the filter,
+     * but will never return false for items that are actually in the
+     * filter.
+     */
+    bool mayContain(const T* t) const;
+
+    /*
+     * Methods for add/remove/contain when we already have a hash computed
+     */
+    void add(uint32_t hash);
+    void remove(uint32_t hash);
+    bool mayContain(uint32_t hash) const;
+
+private:
+    static const size_t arraySize = (1 << KeySize);
+    static const uint32_t keyMask = (1 << KeySize) - 1;
+    static const uint32_t keyShift = 16;
+
+    static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
+    static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
+
+    uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
+    uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
+    const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
+    const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
+
+    static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
+
+    uint8_t counters[arraySize];
+};
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::clear()
+{
+    memset(counters, 0, arraySize);
+}
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::add(uint32_t hash)
+{
+    uint8_t& slot1 = firstSlot(hash);
+    if (MOZ_LIKELY(!full(slot1)))
+        ++slot1;
+
+    uint8_t& slot2 = secondSlot(hash);
+    if (MOZ_LIKELY(!full(slot2)))
+        ++slot2;
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE void
+BloomFilter<KeySize, T>::add(const T* t)
+{
+    uint32_t hash = t->hash();
+    return add(hash);
+}
+
+template<unsigned KeySize, class T>
+inline void
+BloomFilter<KeySize, T>::remove(uint32_t hash)
+{
+    // If the slots are full, we don't know whether we bumped them to be
+    // there when we added or not, so just leave them full.
+    uint8_t& slot1 = firstSlot(hash);
+    if (MOZ_LIKELY(!full(slot1)))
+        --slot1;
+
+    uint8_t& slot2 = secondSlot(hash);
+    if (MOZ_LIKELY(!full(slot2)))
+        --slot2;
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE void
+BloomFilter<KeySize, T>::remove(const T* t)
+{
+    uint32_t hash = t->hash();
+    remove(hash);
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE bool
+BloomFilter<KeySize, T>::mayContain(uint32_t hash) const
+{
+    // Check that all the slots for this hash contain something
+    return firstSlot(hash) && secondSlot(hash);
+}
+
+template<unsigned KeySize, class T>
+MOZ_ALWAYS_INLINE bool
+BloomFilter<KeySize, T>::mayContain(const T* t) const
+{
+    uint32_t hash = t->hash();
+    return mayContain(hash);
+}
+
+} // namespace mozilla
+
+#endif /* mozilla_BloomFilter_h_ */
--- a/mfbt/Likely.h
+++ b/mfbt/Likely.h
@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * MOZ_LIKELY and MOZ_UNLIKELY macros to hint to the compiler how a
+ * boolean predicate should be branch-predicted.
+ */
+
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))
+#  define MOZ_LIKELY(x)   (__builtin_expect((x), 1))
+#  define MOZ_UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#  define MOZ_LIKELY(x)   (x)
+#  define MOZ_UNLIKELY(x) (x)
+#endif
+
--- a/mfbt/exported_headers.mk
+++ b/mfbt/exported_headers.mk
@ -44,7 +44,9 @@ EXPORTS_NAMESPACES += mozilla
 EXPORTS_mozilla += \
  Assertions.h \
  Attributes.h \
+  BloomFilter.h \
  GuardObjects.h \
+  Likely.h \
  LinkedList.h \
  MSStdInt.h \
  RangedPtr.h \
--- a/xpcom/tests/Makefile.in
+++ b/xpcom/tests/Makefile.in
@ -91,6 +91,7 @@ CPP_UNIT_TESTS = \
                 ShowAlignments.cpp \
                 ShowSSEConfig.cpp \
                 TestAutoPtr.cpp \
+                 TestBloomFilter.cpp \
                 TestCOMArray.cpp \
                 TestCOMPtr.cpp \
                 TestCOMPtrEq.cpp \
--- a/xpcom/tests/TestBloomFilter.cpp
+++ b/xpcom/tests/TestBloomFilter.cpp
@ -0,0 +1,120 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/BloomFilter.h"
+#include "TestHarness.h"
+
+#include <stdio.h>
+
+using namespace mozilla;
+
+class FilterChecker {
+public:
+  FilterChecker(uint32_t hash) :
+    mHash(hash)
+  {}
+  
+  uint32_t hash() const { return mHash; }
+
+private:
+  uint32_t mHash;
+};
+
+int main()
+{
+  BloomFilter<12, FilterChecker> *filter = new BloomFilter<12, FilterChecker>();
+
+  FilterChecker one(1);
+  FilterChecker two(0x20000);
+  FilterChecker many(0x10000);
+  FilterChecker multiple(0x20001);
+
+  filter->add(&one);
+  if (!filter->mayContain(&one)) {
+    fail("Filter should contain 'one'");
+    return -1;
+  }
+
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not");
+    return -1;
+  }
+
+  if (!filter->mayContain(&many)) {
+    fail("Filter should contain 'many' (false positive)");
+    return -1;
+  }
+
+  filter->add(&two);
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' (false positive)");
+    return -1;
+  }
+
+  // Test basic removals
+  filter->remove(&two);
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not after two was "
+         "removed");
+    return -1;
+  }
+
+  // Test multiple addition/removal
+  const unsigned FILTER_SIZE = 255;
+  for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
+    filter->add(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' after 'two' added lots of times "
+         "(false positive)");
+    return -1;
+  }
+  for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
+    filter->remove(&two);
+  }
+  if (filter->mayContain(&multiple)) {
+    fail("Filter claims to contain 'multiple' when it should not after two was "
+         "removed lots of times");
+    return -1;
+  }
+
+  // Test overflowing the filter buckets
+  for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
+    filter->add(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter should contain 'multiple' after 'two' added lots more times "
+         "(false positive)");
+    return -1;
+  }
+  for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
+    filter->remove(&two);
+  }
+  if (!filter->mayContain(&multiple)) {
+    fail("Filter claims to not contain 'multiple' even though we should have "
+         "run out of space in the buckets (false positive)");
+    return -1;
+  }
+  if (!filter->mayContain(&two)) {
+    fail("Filter claims to not contain 'two' even though we should have run "
+         "out of space in the buckets (false positive)");
+    return -1;
+  }
+
+  filter->remove(&one);
+  if (filter->mayContain(&one)) {
+    fail("Filter should not contain 'one', because we didn't overflow its "
+         "bucket");
+    return -1;
+  }
+  
+  filter->clear();
+  if (filter->mayContain(&multiple)) {
+    fail("clear() failed to work");
+    return -1;
+  }
+
+  return 0;
+}