зеркало из https://github.com/mozilla/pjs.git
Bug 730100. Add a Bloom filter implementation. r=waldo
This commit is contained in:
Родитель
70c5f45260
Коммит
57ad216048
|
@ -0,0 +1,232 @@
|
|||
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/*
|
||||
* A counting Bloom filter implementation. This allows consumers to
|
||||
* do fast probabilistic "is item X in set Y?" testing which will
|
||||
* never answer "no" when the correct answer is "yes" (but might
|
||||
* incorrectly answer "yes" when the correct answer is "no").
|
||||
*/
|
||||
|
||||
#ifndef mozilla_BloomFilter_h_
|
||||
#define mozilla_BloomFilter_h_
|
||||
|
||||
#include "mozilla/Likely.h"
|
||||
#include "mozilla/StdInt.h"
|
||||
#include "mozilla/Util.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
namespace mozilla {
|
||||
|
||||
/*
|
||||
* This class implements a counting Bloom filter as described at
|
||||
* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
|
||||
* 8-bit counters. This allows quick probabilistic answers to the
|
||||
* question "is object X in set Y?" where the contents of Y might not
|
||||
* be time-invariant. The probabilistic nature of the test means that
|
||||
* sometimes the answer will be "yes" when it should be "no". If the
|
||||
* answer is "no", then X is guaranteed not to be in Y.
|
||||
*
|
||||
* The filter is parametrized on KeySize, which is the size of the key
|
||||
* generated by each of hash functions used by the filter, in bits,
|
||||
* and the type of object T being added and removed. T must implement
|
||||
* a |uint32_t hash() const| method which returns a uint32_t hash key
|
||||
* that will be used to generate the two separate hash functions for
|
||||
* the Bloom filter. This hash key MUST be well-distributed for good
|
||||
* results! KeySize is not allowed to be larger than 16.
|
||||
*
|
||||
* The filter uses exactly 2**KeySize bytes of memory. From now on we
|
||||
* will refer to the memory used by the filter as M.
|
||||
*
|
||||
* The expected rate of incorrect "yes" answers depends on M and on
|
||||
* the number N of objects in set Y. As long as N is small compared
|
||||
* to M, the rate of such answers is expected to be approximately
|
||||
* 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
|
||||
* elements then using a KeySize of 12 gives a reasonably low
|
||||
* incorrect answer rate. A KeySize of 12 has the additional benefit
|
||||
* of using exactly one page for the filter in typical hardware
|
||||
* configurations.
|
||||
*/
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
class BloomFilter {
|
||||
/*
|
||||
* A counting Bloom filter with 8-bit counters. For now we assume
|
||||
* that having two hash functions is enough, but we may revisit that
|
||||
* decision later.
|
||||
*
|
||||
* The filter uses an array with 2**KeySize entries.
|
||||
*
|
||||
* Assuming a well-distributed hash function, a Bloom filter with
|
||||
* array size M containing N elements and
|
||||
* using k hash function has expected false positive rate exactly
|
||||
*
|
||||
* $ (1 - (1 - 1/M)^{kN})^k $
|
||||
*
|
||||
* because each array slot has a
|
||||
*
|
||||
* $ (1 - 1/M)^{kN} $
|
||||
*
|
||||
* chance of being 0, and the expected false positive rate is the
|
||||
* probability that all of the k hash functions will hit a nonzero
|
||||
* slot.
|
||||
*
|
||||
* For reasonable assumptions (M large, kN large, which should both
|
||||
* hold if we're worried about false positives) about M and kN this
|
||||
* becomes approximately
|
||||
*
|
||||
* $$ (1 - \exp(-kN/M))^k $$
|
||||
*
|
||||
* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
|
||||
* or in other words
|
||||
*
|
||||
* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
|
||||
*
|
||||
* where r is the false positive rate. This can be used to compute
|
||||
* the desired KeySize for a given load N and false positive rate r.
|
||||
*
|
||||
* If N/M is assumed small, then the false positive rate can
|
||||
* further be approximated as 4*N^2/M^2. So increasing KeySize by
|
||||
* 1, which doubles M, reduces the false positive rate by about a
|
||||
* factor of 4, and a false positive rate of 1% corresponds to
|
||||
* about M/N == 20.
|
||||
*
|
||||
* What this means in practice is that for a few hundred keys using a
|
||||
* KeySize of 12 gives false positive rates on the order of 0.25-4%.
|
||||
*
|
||||
* Similarly, using a KeySize of 10 would lead to a 4% false
|
||||
* positive rate for N == 100 and to quite bad false positive
|
||||
* rates for larger N.
|
||||
*/
|
||||
public:
|
||||
BloomFilter() {
|
||||
MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
|
||||
|
||||
// Should we have a custom operator new using calloc instead and
|
||||
// require that we're allocated via the operator?
|
||||
clear();
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear the filter. This should be done before reusing it, because
|
||||
* just removing all items doesn't clear counters that hit the upper
|
||||
* bound.
|
||||
*/
|
||||
void clear();
|
||||
|
||||
/*
|
||||
* Add an item to the filter.
|
||||
*/
|
||||
void add(const T* t);
|
||||
|
||||
/*
|
||||
* Remove an item from the filter.
|
||||
*/
|
||||
void remove(const T* t);
|
||||
|
||||
/*
|
||||
* Check whether the filter might contain an item. This can
|
||||
* sometimes return true even if the item is not in the filter,
|
||||
* but will never return false for items that are actually in the
|
||||
* filter.
|
||||
*/
|
||||
bool mayContain(const T* t) const;
|
||||
|
||||
/*
|
||||
* Methods for add/remove/contain when we already have a hash computed
|
||||
*/
|
||||
void add(uint32_t hash);
|
||||
void remove(uint32_t hash);
|
||||
bool mayContain(uint32_t hash) const;
|
||||
|
||||
private:
|
||||
static const size_t arraySize = (1 << KeySize);
|
||||
static const uint32_t keyMask = (1 << KeySize) - 1;
|
||||
static const uint32_t keyShift = 16;
|
||||
|
||||
static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
|
||||
static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
|
||||
|
||||
uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
|
||||
uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
|
||||
const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
|
||||
const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
|
||||
|
||||
static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
|
||||
|
||||
uint8_t counters[arraySize];
|
||||
};
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
inline void
|
||||
BloomFilter<KeySize, T>::clear()
|
||||
{
|
||||
memset(counters, 0, arraySize);
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
inline void
|
||||
BloomFilter<KeySize, T>::add(uint32_t hash)
|
||||
{
|
||||
uint8_t& slot1 = firstSlot(hash);
|
||||
if (MOZ_LIKELY(!full(slot1)))
|
||||
++slot1;
|
||||
|
||||
uint8_t& slot2 = secondSlot(hash);
|
||||
if (MOZ_LIKELY(!full(slot2)))
|
||||
++slot2;
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
MOZ_ALWAYS_INLINE void
|
||||
BloomFilter<KeySize, T>::add(const T* t)
|
||||
{
|
||||
uint32_t hash = t->hash();
|
||||
return add(hash);
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
inline void
|
||||
BloomFilter<KeySize, T>::remove(uint32_t hash)
|
||||
{
|
||||
// If the slots are full, we don't know whether we bumped them to be
|
||||
// there when we added or not, so just leave them full.
|
||||
uint8_t& slot1 = firstSlot(hash);
|
||||
if (MOZ_LIKELY(!full(slot1)))
|
||||
--slot1;
|
||||
|
||||
uint8_t& slot2 = secondSlot(hash);
|
||||
if (MOZ_LIKELY(!full(slot2)))
|
||||
--slot2;
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
MOZ_ALWAYS_INLINE void
|
||||
BloomFilter<KeySize, T>::remove(const T* t)
|
||||
{
|
||||
uint32_t hash = t->hash();
|
||||
remove(hash);
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
MOZ_ALWAYS_INLINE bool
|
||||
BloomFilter<KeySize, T>::mayContain(uint32_t hash) const
|
||||
{
|
||||
// Check that all the slots for this hash contain something
|
||||
return firstSlot(hash) && secondSlot(hash);
|
||||
}
|
||||
|
||||
template<unsigned KeySize, class T>
|
||||
MOZ_ALWAYS_INLINE bool
|
||||
BloomFilter<KeySize, T>::mayContain(const T* t) const
|
||||
{
|
||||
uint32_t hash = t->hash();
|
||||
return mayContain(hash);
|
||||
}
|
||||
|
||||
} // namespace mozilla
|
||||
|
||||
#endif /* mozilla_BloomFilter_h_ */
|
|
@ -0,0 +1,17 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/*
|
||||
* MOZ_LIKELY and MOZ_UNLIKELY macros to hint to the compiler how a
|
||||
* boolean predicate should be branch-predicted.
|
||||
*/
|
||||
|
||||
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))
|
||||
# define MOZ_LIKELY(x) (__builtin_expect((x), 1))
|
||||
# define MOZ_UNLIKELY(x) (__builtin_expect((x), 0))
|
||||
#else
|
||||
# define MOZ_LIKELY(x) (x)
|
||||
# define MOZ_UNLIKELY(x) (x)
|
||||
#endif
|
||||
|
|
@ -44,7 +44,9 @@ EXPORTS_NAMESPACES += mozilla
|
|||
EXPORTS_mozilla += \
|
||||
Assertions.h \
|
||||
Attributes.h \
|
||||
BloomFilter.h \
|
||||
GuardObjects.h \
|
||||
Likely.h \
|
||||
LinkedList.h \
|
||||
MSStdInt.h \
|
||||
RangedPtr.h \
|
||||
|
|
|
@ -91,6 +91,7 @@ CPP_UNIT_TESTS = \
|
|||
ShowAlignments.cpp \
|
||||
ShowSSEConfig.cpp \
|
||||
TestAutoPtr.cpp \
|
||||
TestBloomFilter.cpp \
|
||||
TestCOMArray.cpp \
|
||||
TestCOMPtr.cpp \
|
||||
TestCOMPtrEq.cpp \
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "mozilla/BloomFilter.h"
|
||||
#include "TestHarness.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
using namespace mozilla;
|
||||
|
||||
class FilterChecker {
|
||||
public:
|
||||
FilterChecker(uint32_t hash) :
|
||||
mHash(hash)
|
||||
{}
|
||||
|
||||
uint32_t hash() const { return mHash; }
|
||||
|
||||
private:
|
||||
uint32_t mHash;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
BloomFilter<12, FilterChecker> *filter = new BloomFilter<12, FilterChecker>();
|
||||
|
||||
FilterChecker one(1);
|
||||
FilterChecker two(0x20000);
|
||||
FilterChecker many(0x10000);
|
||||
FilterChecker multiple(0x20001);
|
||||
|
||||
filter->add(&one);
|
||||
if (!filter->mayContain(&one)) {
|
||||
fail("Filter should contain 'one'");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (filter->mayContain(&multiple)) {
|
||||
fail("Filter claims to contain 'multiple' when it should not");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!filter->mayContain(&many)) {
|
||||
fail("Filter should contain 'many' (false positive)");
|
||||
return -1;
|
||||
}
|
||||
|
||||
filter->add(&two);
|
||||
if (!filter->mayContain(&multiple)) {
|
||||
fail("Filter should contain 'multiple' (false positive)");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Test basic removals
|
||||
filter->remove(&two);
|
||||
if (filter->mayContain(&multiple)) {
|
||||
fail("Filter claims to contain 'multiple' when it should not after two was "
|
||||
"removed");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Test multiple addition/removal
|
||||
const unsigned FILTER_SIZE = 255;
|
||||
for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
|
||||
filter->add(&two);
|
||||
}
|
||||
if (!filter->mayContain(&multiple)) {
|
||||
fail("Filter should contain 'multiple' after 'two' added lots of times "
|
||||
"(false positive)");
|
||||
return -1;
|
||||
}
|
||||
for (unsigned i = 0; i < FILTER_SIZE - 1; ++i) {
|
||||
filter->remove(&two);
|
||||
}
|
||||
if (filter->mayContain(&multiple)) {
|
||||
fail("Filter claims to contain 'multiple' when it should not after two was "
|
||||
"removed lots of times");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Test overflowing the filter buckets
|
||||
for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
|
||||
filter->add(&two);
|
||||
}
|
||||
if (!filter->mayContain(&multiple)) {
|
||||
fail("Filter should contain 'multiple' after 'two' added lots more times "
|
||||
"(false positive)");
|
||||
return -1;
|
||||
}
|
||||
for (unsigned i = 0; i < FILTER_SIZE + 1; ++i) {
|
||||
filter->remove(&two);
|
||||
}
|
||||
if (!filter->mayContain(&multiple)) {
|
||||
fail("Filter claims to not contain 'multiple' even though we should have "
|
||||
"run out of space in the buckets (false positive)");
|
||||
return -1;
|
||||
}
|
||||
if (!filter->mayContain(&two)) {
|
||||
fail("Filter claims to not contain 'two' even though we should have run "
|
||||
"out of space in the buckets (false positive)");
|
||||
return -1;
|
||||
}
|
||||
|
||||
filter->remove(&one);
|
||||
if (filter->mayContain(&one)) {
|
||||
fail("Filter should not contain 'one', because we didn't overflow its "
|
||||
"bucket");
|
||||
return -1;
|
||||
}
|
||||
|
||||
filter->clear();
|
||||
if (filter->mayContain(&multiple)) {
|
||||
fail("clear() failed to work");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Загрузка…
Ссылка в новой задаче