gecko-dev/mfbt/BloomFilter.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * A counting Bloom filter implementation.  This allows consumers to
 * do fast probabilistic "is item X in set Y?" testing which will
 * never answer "no" when the correct answer is "yes" (but might
 * incorrectly answer "yes" when the correct answer is "no").
 */

#ifndef mozilla_BloomFilter_h
#define mozilla_BloomFilter_h

#include "mozilla/Assertions.h"
#include "mozilla/Likely.h"
#include "mozilla/StandardInteger.h"
#include "mozilla/Util.h"

#include <string.h>

namespace mozilla {

/*
 * This class implements a counting Bloom filter as described at
 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
 * 8-bit counters.  This allows quick probabilistic answers to the
 * question "is object X in set Y?" where the contents of Y might not
 * be time-invariant.  The probabilistic nature of the test means that
 * sometimes the answer will be "yes" when it should be "no".  If the
 * answer is "no", then X is guaranteed not to be in Y.
 *
 * The filter is parametrized on KeySize, which is the size of the key
 * generated by each of hash functions used by the filter, in bits,
 * and the type of object T being added and removed.  T must implement
 * a |uint32_t hash() const| method which returns a uint32_t hash key
 * that will be used to generate the two separate hash functions for
 * the Bloom filter.  This hash key MUST be well-distributed for good
 * results!  KeySize is not allowed to be larger than 16.
 *
 * The filter uses exactly 2**KeySize bytes of memory.  From now on we
 * will refer to the memory used by the filter as M.
 *
 * The expected rate of incorrect "yes" answers depends on M and on
 * the number N of objects in set Y.  As long as N is small compared
 * to M, the rate of such answers is expected to be approximately
 * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
 * elements then using a KeySize of 12 gives a reasonably low
 * incorrect answer rate.  A KeySize of 12 has the additional benefit
 * of using exactly one page for the filter in typical hardware
 * configurations.
 */

template<unsigned KeySize, class T>
class BloomFilter
{
    /*
     * A counting Bloom filter with 8-bit counters.  For now we assume
     * that having two hash functions is enough, but we may revisit that
     * decision later.
     *
     * The filter uses an array with 2**KeySize entries.
     *
     * Assuming a well-distributed hash function, a Bloom filter with
     * array size M containing N elements and
     * using k hash function has expected false positive rate exactly
     *
     * $  (1 - (1 - 1/M)^{kN})^k  $
     *
     * because each array slot has a
     *
     * $  (1 - 1/M)^{kN}  $
     *
     * chance of being 0, and the expected false positive rate is the
     * probability that all of the k hash functions will hit a nonzero
     * slot.
     *
     * For reasonable assumptions (M large, kN large, which should both
     * hold if we're worried about false positives) about M and kN this
     * becomes approximately
     *
     * $$  (1 - \exp(-kN/M))^k   $$
     *
     * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
     * or in other words
     *
     * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
     *
     * where r is the false positive rate.  This can be used to compute
     * the desired KeySize for a given load N and false positive rate r.
     *
     * If N/M is assumed small, then the false positive rate can
     * further be approximated as 4*N^2/M^2.  So increasing KeySize by
     * 1, which doubles M, reduces the false positive rate by about a
     * factor of 4, and a false positive rate of 1% corresponds to
     * about M/N == 20.
     *
     * What this means in practice is that for a few hundred keys using a
     * KeySize of 12 gives false positive rates on the order of 0.25-4%.
     *
     * Similarly, using a KeySize of 10 would lead to a 4% false
     * positive rate for N == 100 and to quite bad false positive
     * rates for larger N.
     */
  public:
    BloomFilter() {
        MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");

        // Should we have a custom operator new using calloc instead and
        // require that we're allocated via the operator?
        clear();
    }

    /*
     * Clear the filter.  This should be done before reusing it, because
     * just removing all items doesn't clear counters that hit the upper
     * bound.
     */
    void clear();

    /*
     * Add an item to the filter.
     */
    void add(const T* t);

    /*
     * Remove an item from the filter.
     */
    void remove(const T* t);

    /*
     * Check whether the filter might contain an item.  This can
     * sometimes return true even if the item is not in the filter,
     * but will never return false for items that are actually in the
     * filter.
     */
    bool mightContain(const T* t) const;

    /*
     * Methods for add/remove/contain when we already have a hash computed
     */
    void add(uint32_t hash);
    void remove(uint32_t hash);
    bool mightContain(uint32_t hash) const;

  private:
    static const size_t arraySize = (1 << KeySize);
    static const uint32_t keyMask = (1 << KeySize) - 1;
    static const uint32_t keyShift = 16;

    static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
    static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }

    uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
    uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
    const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
    const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }

    static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }

    uint8_t counters[arraySize];
};

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::clear()
{
  memset(counters, 0, arraySize);
}

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::add(uint32_t hash)
{
  uint8_t& slot1 = firstSlot(hash);
  if (MOZ_LIKELY(!full(slot1)))
    ++slot1;

  uint8_t& slot2 = secondSlot(hash);
  if (MOZ_LIKELY(!full(slot2)))
    ++slot2;
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void
BloomFilter<KeySize, T>::add(const T* t)
{
  uint32_t hash = t->hash();
  return add(hash);
}

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::remove(uint32_t hash)
{
  // If the slots are full, we don't know whether we bumped them to be
  // there when we added or not, so just leave them full.
  uint8_t& slot1 = firstSlot(hash);
  if (MOZ_LIKELY(!full(slot1)))
    --slot1;

  uint8_t& slot2 = secondSlot(hash);
  if (MOZ_LIKELY(!full(slot2)))
    --slot2;
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void
BloomFilter<KeySize, T>::remove(const T* t)
{
  uint32_t hash = t->hash();
  remove(hash);
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool
BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
{
  // Check that all the slots for this hash contain something
  return firstSlot(hash) && secondSlot(hash);
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool
BloomFilter<KeySize, T>::mightContain(const T* t) const
{
  uint32_t hash = t->hash();
  return mightContain(hash);
}

} // namespace mozilla

#endif /* mozilla_BloomFilter_h */
Bug 896341 - Update include guards and modelines in MFBT; r=Waldo 2013-07-24 11:41:39 +04:00			`/* -- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* vim: set ts=8 sts=2 et sw=2 tw=80: */`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`/* This Source Code Form is subject to the terms of the Mozilla Public`
Bug 896341 - Update include guards and modelines in MFBT; r=Waldo 2013-07-24 11:41:39 +04:00			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/. */`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00
			`/*`
			`* A counting Bloom filter implementation. This allows consumers to`
			`* do fast probabilistic "is item X in set Y?" testing which will`
			`* never answer "no" when the correct answer is "yes" (but might`
			`* incorrectly answer "yes" when the correct answer is "no").`
			`*/`

Bug 896341 - Update include guards and modelines in MFBT; r=Waldo 2013-07-24 11:41:39 +04:00			`#ifndef mozilla_BloomFilter_h`
			`#define mozilla_BloomFilter_h`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00
Move TestBloomFilter out of XPCOM and into mfbt/tests, now that it exists. No bug, r=testingonlychange --HG-- extra : rebase_source : 85a2a56bcc02bf122ee7fd21d16dffd534d1c98d 2012-10-04 04:09:59 +04:00			`#include "mozilla/Assertions.h"`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`#include "mozilla/Likely.h"`
Bug 731789 - Followup, s/StdInt/StandardInteger in BloomFilter.h. r=me 2012-03-01 08:40:16 +04:00			`#include "mozilla/StandardInteger.h"`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`#include "mozilla/Util.h"`

			`#include <string.h>`

			`namespace mozilla {`

			`/*`
			`* This class implements a counting Bloom filter as described at`
			`* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with`
			`* 8-bit counters. This allows quick probabilistic answers to the`
			`* question "is object X in set Y?" where the contents of Y might not`
			`* be time-invariant. The probabilistic nature of the test means that`
			`* sometimes the answer will be "yes" when it should be "no". If the`
			`* answer is "no", then X is guaranteed not to be in Y.`
			`*`
			`* The filter is parametrized on KeySize, which is the size of the key`
			`* generated by each of hash functions used by the filter, in bits,`
			`* and the type of object T being added and removed. T must implement`
			`* a \|uint32_t hash() const\| method which returns a uint32_t hash key`
			`* that will be used to generate the two separate hash functions for`
			`* the Bloom filter. This hash key MUST be well-distributed for good`
			`* results! KeySize is not allowed to be larger than 16.`
			`*`
			`* The filter uses exactly 2**KeySize bytes of memory. From now on we`
			`* will refer to the memory used by the filter as M.`
			`*`
			`* The expected rate of incorrect "yes" answers depends on M and on`
			`* the number N of objects in set Y. As long as N is small compared`
			`* to M, the rate of such answers is expected to be approximately`
			`* 4(N/M)*2 for this filter. In practice, if Y has a few hundred`
			`* elements then using a KeySize of 12 gives a reasonably low`
			`* incorrect answer rate. A KeySize of 12 has the additional benefit`
			`* of using exactly one page for the filter in typical hardware`
			`* configurations.`
			`*/`

			`template<unsigned KeySize, class T>`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`class BloomFilter`
			`{`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`/*`
			`* A counting Bloom filter with 8-bit counters. For now we assume`
			`* that having two hash functions is enough, but we may revisit that`
			`* decision later.`
			`*`
			`* The filter uses an array with 2**KeySize entries.`
			`*`
			`* Assuming a well-distributed hash function, a Bloom filter with`
			`* array size M containing N elements and`
			`* using k hash function has expected false positive rate exactly`
			`*`
			`* $ (1 - (1 - 1/M)^{kN})^k $`
			`*`
			`* because each array slot has a`
			`*`
			`* $ (1 - 1/M)^{kN} $`
			`*`
			`* chance of being 0, and the expected false positive rate is the`
			`* probability that all of the k hash functions will hit a nonzero`
			`* slot.`
			`*`
			`* For reasonable assumptions (M large, kN large, which should both`
			`* hold if we're worried about false positives) about M and kN this`
			`* becomes approximately`
			`*`
			`* $$ (1 - \exp(-kN/M))^k $$`
			`*`
			`* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,`
			`* or in other words`
			`*`
			`* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$`
			`*`
			`* where r is the false positive rate. This can be used to compute`
			`* the desired KeySize for a given load N and false positive rate r.`
			`*`
			`* If N/M is assumed small, then the false positive rate can`
			`* further be approximated as 4*N^2/M^2. So increasing KeySize by`
			`* 1, which doubles M, reduces the false positive rate by about a`
			`* factor of 4, and a false positive rate of 1% corresponds to`
			`* about M/N == 20.`
			`*`
			`* What this means in practice is that for a few hundred keys using a`
			`* KeySize of 12 gives false positive rates on the order of 0.25-4%.`
			`*`
			`* Similarly, using a KeySize of 10 would lead to a 4% false`
			`* positive rate for N == 100 and to quite bad false positive`
			`* rates for larger N.`
			`*/`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`public:`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`BloomFilter() {`
			`MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");`

			`// Should we have a custom operator new using calloc instead and`
			`// require that we're allocated via the operator?`
			`clear();`
			`}`

			`/*`
			`* Clear the filter. This should be done before reusing it, because`
			`* just removing all items doesn't clear counters that hit the upper`
			`* bound.`
			`*/`
			`void clear();`

			`/*`
			`* Add an item to the filter.`
			`*/`
			`void add(const T* t);`

			`/*`
			`* Remove an item from the filter.`
			`*/`
			`void remove(const T* t);`

			`/*`
			`* Check whether the filter might contain an item. This can`
			`* sometimes return true even if the item is not in the filter,`
			`* but will never return false for items that are actually in the`
			`* filter.`
			`*/`
Bug 705877 part 1. Rename mayContain on BloomFilter to mightContain. r=dbaron 2012-03-13 07:54:12 +04:00			`bool mightContain(const T* t) const;`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00
			`/*`
			`* Methods for add/remove/contain when we already have a hash computed`
			`*/`
			`void add(uint32_t hash);`
			`void remove(uint32_t hash);`
Bug 705877 part 1. Rename mayContain on BloomFilter to mightContain. r=dbaron 2012-03-13 07:54:12 +04:00			`bool mightContain(uint32_t hash) const;`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`private:`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`static const size_t arraySize = (1 << KeySize);`
			`static const uint32_t keyMask = (1 << KeySize) - 1;`
			`static const uint32_t keyShift = 16;`

			`static uint32_t hash1(uint32_t hash) { return hash & keyMask; }`
			`static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }`

			`uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }`
			`uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }`
			`const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }`
			`const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }`

			`static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }`

			`uint8_t counters[arraySize];`
			`};`

			`template<unsigned KeySize, class T>`
			`inline void`
			`BloomFilter<KeySize, T>::clear()`
			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`memset(counters, 0, arraySize);`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`inline void`
			`BloomFilter<KeySize, T>::add(uint32_t hash)`
			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`uint8_t& slot1 = firstSlot(hash);`
			`if (MOZ_LIKELY(!full(slot1)))`
			`++slot1;`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`uint8_t& slot2 = secondSlot(hash);`
			`if (MOZ_LIKELY(!full(slot2)))`
			`++slot2;`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`MOZ_ALWAYS_INLINE void`
			`BloomFilter<KeySize, T>::add(const T* t)`
			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`uint32_t hash = t->hash();`
			`return add(hash);`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`inline void`
			`BloomFilter<KeySize, T>::remove(uint32_t hash)`
			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`// If the slots are full, we don't know whether we bumped them to be`
			`// there when we added or not, so just leave them full.`
			`uint8_t& slot1 = firstSlot(hash);`
			`if (MOZ_LIKELY(!full(slot1)))`
			`--slot1;`

			`uint8_t& slot2 = secondSlot(hash);`
			`if (MOZ_LIKELY(!full(slot2)))`
			`--slot2;`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`MOZ_ALWAYS_INLINE void`
			`BloomFilter<KeySize, T>::remove(const T* t)`
			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`uint32_t hash = t->hash();`
			`remove(hash);`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`MOZ_ALWAYS_INLINE bool`
Bug 705877 part 1. Rename mayContain on BloomFilter to mightContain. r=dbaron 2012-03-13 07:54:12 +04:00			`BloomFilter<KeySize, T>::mightContain(uint32_t hash) const`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`// Check that all the slots for this hash contain something`
			`return firstSlot(hash) && secondSlot(hash);`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`template<unsigned KeySize, class T>`
			`MOZ_ALWAYS_INLINE bool`
Bug 705877 part 1. Rename mayContain on BloomFilter to mightContain. r=dbaron 2012-03-13 07:54:12 +04:00			`BloomFilter<KeySize, T>::mightContain(const T* t) const`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`{`
Style patrol to make everything conform to mfbt/STYLE. No bug, r=sparky 2012-06-04 07:36:43 +04:00			`uint32_t hash = t->hash();`
			`return mightContain(hash);`
Bug 730100. Add a Bloom filter implementation. r=waldo 2012-03-01 06:40:47 +04:00			`}`

			`} // namespace mozilla`

Bug 896341 - Update include guards and modelines in MFBT; r=Waldo 2013-07-24 11:41:39 +04:00			`#endif /* mozilla_BloomFilter_h */`