gecko-dev/mfbt/BloomFilter.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * A counting Bloom filter implementation.  This allows consumers to
 * do fast probabilistic "is item X in set Y?" testing which will
 * never answer "no" when the correct answer is "yes" (but might
 * incorrectly answer "yes" when the correct answer is "no").
 */

#ifndef mozilla_BloomFilter_h
#define mozilla_BloomFilter_h

#include "mozilla/Assertions.h"
#include "mozilla/Likely.h"

#include <stdint.h>
#include <string.h>

namespace mozilla {

/*
 * This class implements a counting Bloom filter as described at
 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
 * 8-bit counters.  This allows quick probabilistic answers to the
 * question "is object X in set Y?" where the contents of Y might not
 * be time-invariant.  The probabilistic nature of the test means that
 * sometimes the answer will be "yes" when it should be "no".  If the
 * answer is "no", then X is guaranteed not to be in Y.
 *
 * The filter is parametrized on KeySize, which is the size of the key
 * generated by each of hash functions used by the filter, in bits,
 * and the type of object T being added and removed.  T must implement
 * a |uint32_t hash() const| method which returns a uint32_t hash key
 * that will be used to generate the two separate hash functions for
 * the Bloom filter.  This hash key MUST be well-distributed for good
 * results!  KeySize is not allowed to be larger than 16.
 *
 * The filter uses exactly 2**KeySize bytes of memory.  From now on we
 * will refer to the memory used by the filter as M.
 *
 * The expected rate of incorrect "yes" answers depends on M and on
 * the number N of objects in set Y.  As long as N is small compared
 * to M, the rate of such answers is expected to be approximately
 * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
 * elements then using a KeySize of 12 gives a reasonably low
 * incorrect answer rate.  A KeySize of 12 has the additional benefit
 * of using exactly one page for the filter in typical hardware
 * configurations.
 */

template<unsigned KeySize, class T>
class BloomFilter
{
  /*
   * A counting Bloom filter with 8-bit counters.  For now we assume
   * that having two hash functions is enough, but we may revisit that
   * decision later.
   *
   * The filter uses an array with 2**KeySize entries.
   *
   * Assuming a well-distributed hash function, a Bloom filter with
   * array size M containing N elements and
   * using k hash function has expected false positive rate exactly
   *
   * $  (1 - (1 - 1/M)^{kN})^k  $
   *
   * because each array slot has a
   *
   * $  (1 - 1/M)^{kN}  $
   *
   * chance of being 0, and the expected false positive rate is the
   * probability that all of the k hash functions will hit a nonzero
   * slot.
   *
   * For reasonable assumptions (M large, kN large, which should both
   * hold if we're worried about false positives) about M and kN this
   * becomes approximately
   *
   * $$  (1 - \exp(-kN/M))^k   $$
   *
   * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
   * or in other words
   *
   * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
   *
   * where r is the false positive rate.  This can be used to compute
   * the desired KeySize for a given load N and false positive rate r.
   *
   * If N/M is assumed small, then the false positive rate can
   * further be approximated as 4*N^2/M^2.  So increasing KeySize by
   * 1, which doubles M, reduces the false positive rate by about a
   * factor of 4, and a false positive rate of 1% corresponds to
   * about M/N == 20.
   *
   * What this means in practice is that for a few hundred keys using a
   * KeySize of 12 gives false positive rates on the order of 0.25-4%.
   *
   * Similarly, using a KeySize of 10 would lead to a 4% false
   * positive rate for N == 100 and to quite bad false positive
   * rates for larger N.
   */
public:
  BloomFilter()
  {
    static_assert(KeySize <= kKeyShift, "KeySize too big");

    // Should we have a custom operator new using calloc instead and
    // require that we're allocated via the operator?
    clear();
  }

  /*
   * Clear the filter.  This should be done before reusing it, because
   * just removing all items doesn't clear counters that hit the upper
   * bound.
   */
  void clear();

  /*
   * Add an item to the filter.
   */
  void add(const T* aValue);

  /*
   * Remove an item from the filter.
   */
  void remove(const T* aValue);

  /*
   * Check whether the filter might contain an item.  This can
   * sometimes return true even if the item is not in the filter,
   * but will never return false for items that are actually in the
   * filter.
   */
  bool mightContain(const T* aValue) const;

  /*
   * Methods for add/remove/contain when we already have a hash computed
   */
  void add(uint32_t aHash);
  void remove(uint32_t aHash);
  bool mightContain(uint32_t aHash) const;

private:
  static const size_t kArraySize = (1 << KeySize);
  static const uint32_t kKeyMask = (1 << KeySize) - 1;
  static const uint32_t kKeyShift = 16;

  static uint32_t hash1(uint32_t aHash)
  {
    return aHash & kKeyMask;
  }
  static uint32_t hash2(uint32_t aHash)
  {
    return (aHash >> kKeyShift) & kKeyMask;
  }

  uint8_t& firstSlot(uint32_t aHash)
  {
    return mCounters[hash1(aHash)];
  }
  uint8_t& secondSlot(uint32_t aHash)
  {
    return mCounters[hash2(aHash)];
  }

  const uint8_t& firstSlot(uint32_t aHash) const
  {
    return mCounters[hash1(aHash)];
  }
  const uint8_t& secondSlot(uint32_t aHash) const
  {
    return mCounters[hash2(aHash)];
  }

  static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }

  uint8_t mCounters[kArraySize];
};

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::clear()
{
  memset(mCounters, 0, kArraySize);
}

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::add(uint32_t aHash)
{
  uint8_t& slot1 = firstSlot(aHash);
  if (MOZ_LIKELY(!full(slot1))) {
    ++slot1;
  }
  uint8_t& slot2 = secondSlot(aHash);
  if (MOZ_LIKELY(!full(slot2))) {
    ++slot2;
  }
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void
BloomFilter<KeySize, T>::add(const T* aValue)
{
  uint32_t hash = aValue->hash();
  return add(hash);
}

template<unsigned KeySize, class T>
inline void
BloomFilter<KeySize, T>::remove(uint32_t aHash)
{
  // If the slots are full, we don't know whether we bumped them to be
  // there when we added or not, so just leave them full.
  uint8_t& slot1 = firstSlot(aHash);
  if (MOZ_LIKELY(!full(slot1))) {
    --slot1;
  }
  uint8_t& slot2 = secondSlot(aHash);
  if (MOZ_LIKELY(!full(slot2))) {
    --slot2;
  }
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void
BloomFilter<KeySize, T>::remove(const T* aValue)
{
  uint32_t hash = aValue->hash();
  remove(hash);
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool
BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
{
  // Check that all the slots for this hash contain something
  return firstSlot(aHash) && secondSlot(aHash);
}

template<unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool
BloomFilter<KeySize, T>::mightContain(const T* aValue) const
{
  uint32_t hash = aValue->hash();
  return mightContain(hash);
}

} // namespace mozilla

#endif /* mozilla_BloomFilter_h */