initial commit

alexanderbez · Dec 30, 2018 · b69335e · b69335e
commit b69335e
Show file tree

Hide file tree

Showing 5 changed files with 339 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+/target
+**/*.rs.bk
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "rsbloom"
+version = "0.1.0"
+authors = ["Aleksandr Bezobchuk <aleks.bezobchuk@gmail.com>"]
+description = "A simple bloom filter implementation in Rust"
+homepage = "https://github.com/alexanderbez/rsbloom"
+documentation = "https://docs.rs/rsbloom/"
+edition = "2018"
+readme = "README.md"
+keywords = ["bloom", "filter", "bloomfilter", "bloomfilters"]
+license = "MIT"
+
+[lib]
+name = "rsbloom"
+
+[dependencies]
+bit-vec = "0.5.0"
+fasthash = "0.3"
+rand = "0.6.1"
diff --git a/examples/simple.rs b/examples/simple.rs
@@ -0,0 +1,17 @@
+//! A simple example showing the use of a Bloom filter.
+
+use rsbloom::BloomFilter;
+
+fn main() {
+  let approx_items = 100;
+  let mut bf = BloomFilter::new(approx_items);
+
+  bf.set(&"foo");
+  bf.set(&"bar");
+
+  bf.has(&"foo"); // true
+  bf.has(&"bar"); // true
+  bf.has(&"baz"); // false
+
+  bf.num_items_approx(); // 2
+}
diff --git a/src/bloom.rs b/src/bloom.rs
@@ -0,0 +1,217 @@
+// The MIT License
+// Copyright (c) 2018 Aleksandr Bezobchuk
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+//! A simple and intuitive implementation of a Bloom filter using enhanced
+//! double hashing.
+
+use bit_vec::BitVec;
+use fasthash::RandomState;
+use fasthash::{murmur3, xx};
+use std::hash::{BuildHasher, Hash, Hasher};
+
+const LN_SQR: f64 = core::f64::consts::LN_2 * core::f64::consts::LN_2;
+const SET_BIT: bool = true;
+const UNSET_BIT: bool = false;
+
+/// The default false positive probability value which is 1%.
+pub const DEFAULT_FALSE_POS: f64 = 0.01;
+
+/// A Bloom filter implementation that tracks the total number of set bits along
+/// with the underlying bit vector and hashing functions, Murmur3 and xxHash.
+pub struct BloomFilter<R: BuildHasher, S: BuildHasher> {
+  bit_vec: BitVec,
+  num_hashes: u64,
+  set_bits: u64,
+  murmur_hasher: R,
+  xx_hasher: S,
+}
+
+impl BloomFilter<RandomState<murmur3::Murmur3_x64_128>, RandomState<xx::XXHash64>> {
+  /// Return a new Bloom filter with a given number of approximate items to set.
+  /// The default false positive probability is set and defined by DEFAULT_FALSE_POS.
+  pub fn new(approx_items: u64) -> Self {
+    BloomFilter::new_with_rate(approx_items, DEFAULT_FALSE_POS)
+  }
+
+  /// Return a new Bloom filter with a given number of approximate items to set
+  /// and a desired false positive probability.
+  pub fn new_with_rate(approx_items: u64, fp_prob: f64) -> Self {
+    let num_bits = optimal_num_bits(approx_items, fp_prob);
+    let num_hashes = optimal_num_hashes(num_bits, approx_items);
+
+    BloomFilter {
+      bit_vec: BitVec::from_elem(num_bits as usize, UNSET_BIT),
+      num_hashes: num_hashes,
+      set_bits: 0,
+      murmur_hasher: RandomState::<murmur3::Murmur3_x64_128>::new(),
+      xx_hasher: RandomState::<xx::XXHash64>::new(),
+    }
+  }
+}
+
+impl<R, S> BloomFilter<R, S>
+where
+  R: BuildHasher,
+  S: BuildHasher,
+{
+  /// Set an object in the Bloom filter. This operation is idempotent in regards
+  /// to each unique object. Each object must implement the Hash trait.
+  pub fn set<T: Hash>(&mut self, obj: &T) {
+    let mut hasher_one = self.murmur_hasher.build_hasher();
+    let mut hasher_two = self.xx_hasher.build_hasher();
+
+    obj.hash(&mut hasher_one);
+    obj.hash(&mut hasher_two);
+
+    let h1 = hasher_one.finish();
+    let h2 = hasher_two.finish();
+
+    for i in 0..self.num_hashes {
+      let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize;
+
+      // Unwrap option<bool> and if the bit is unset then we increment the set
+      // bits.
+      //
+      // NOTE: We should not panic here as enhanced_double_hash ensures the
+      // index is within bounds via modulo bit vector table size.
+      if self.bit_vec.get(bit_idx).unwrap() == UNSET_BIT {
+        self.set_bits += 1;
+      }
+
+      self.bit_vec.set(bit_idx, SET_BIT);
+    }
+  }
+
+  /// Returns a bool reflecting if a given object is 'most likely' in the Bloom
+  /// filter or not. There is a possibility for a false positive with the
+  /// probability being under the Bloom filter's p value, but a false negative
+  /// will never occur.
+  pub fn has<T: Hash>(&self, obj: &T) -> Option<bool> {
+    let mut hasher_one = self.murmur_hasher.build_hasher();
+    let mut hasher_two = self.xx_hasher.build_hasher();
+
+    obj.hash(&mut hasher_one);
+    obj.hash(&mut hasher_two);
+
+    let h1 = hasher_one.finish();
+    let h2 = hasher_two.finish();
+
+    for i in 0..self.num_hashes {
+      let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize;
+
+      // Unwrap option<bool> and if the bit is not set, then we short-circuit
+      // and return false.
+      //
+      // NOTE: We should not panic here as enhanced_double_hash ensures the
+      // index is within bounds via modulo bit vector table size.
+      if self.bit_vec.get(bit_idx).unwrap() != SET_BIT {
+        return Some(false);
+      }
+    }
+
+    Some(true)
+  }
+
+  /// Returns the approximate total number of objects set in the Bloom filter.
+  pub fn num_items_approx(&self) -> u64 {
+    let m = self.bit_vec.len() as f64;
+    let k = self.num_hashes as f64;
+    let x = self.set_bits as f64;
+    (-(m / k) * (1.0 - (x / m)).ln()) as u64
+  }
+
+  fn enhanced_double_hash(&self, h1: u64, h2: u64, i: u64) -> u64 {
+    let r = h1.wrapping_add(i.wrapping_mul(h2)).wrapping_add(i.pow(3));
+    r % self.bit_vec.len() as u64
+  }
+}
+
+/// Return the optimal bit vector size for a Bloom filter given an approximate
+/// size approx_items and a desired false positive probability fp_prob.
+fn optimal_num_bits(approx_items: u64, fp_prob: f64) -> u64 {
+  (-((fp_prob.ln() * (approx_items as f64)) / LN_SQR)).ceil() as u64
+}
+
+/// Return the optimal number of hash 'functions' for a Bloom filter given a
+/// bit vector size num_bits and an approximate set size approx_items.
+fn optimal_num_hashes(num_bits: u64, approx_items: u64) -> u64 {
+  (((num_bits / approx_items) as f64) * core::f64::consts::LN_2).ceil() as u64
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+  use rand::distributions::Alphanumeric;
+  use rand::{thread_rng, Rng};
+  use std::collections::HashSet;
+
+  fn random_str(len: usize) -> String {
+    thread_rng().sample_iter(&Alphanumeric).take(len).collect()
+  }
+
+  #[test]
+  fn test_bloom_filter() {
+    let n = 1000;
+    let mut items = HashSet::<String>::new();
+
+    // generate random strings to insert
+    for _ in 0..n {
+      items.insert(random_str(30));
+    }
+
+    let mut bf = BloomFilter::new(items.len() as u64);
+
+    // test inclusion
+    for item in items.iter() {
+      bf.set(item);
+
+      let exists = bf.has(item).unwrap();
+      assert_eq!(
+        exists, true,
+        "item {} should result in a positive inclusion",
+        item,
+      );
+    }
+
+    // test false negatives
+    for _ in 0..n {
+      let item = random_str(30);
+      let exists = bf.has(&item).unwrap();
+
+      if items.contains(&item) {
+        assert_eq!(exists, true, "item {} resulted in a false negative", item);
+      }
+    }
+  }
+
+  #[test]
+  fn test_optimal_num_bits() {
+    assert_eq!(optimal_num_bits(10, 0.04), 67);
+    assert_eq!(optimal_num_bits(5000, 0.01), 47926);
+    assert_eq!(optimal_num_bits(100000, 0.01), 958506);
+  }
+
+  #[test]
+  fn test_optimal_num_hashes() {
+    assert_eq!(optimal_num_hashes(67, 10), 5);
+    assert_eq!(optimal_num_hashes(47926, 5000), 7);
+    assert_eq!(optimal_num_hashes(958506, 100000), 7);
+  }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -0,0 +1,83 @@
+// The MIT License
+// Copyright (c) 2018 Aleksandr Bezobchuk
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+//! A simple implementation of a Bloom filter, a space-efficient probabilistic
+//! data structure.
+//!
+//! # Bloom Filters
+//!
+//! A Bloom filter is a space-efficient probabilistic data structure that is
+//! used to test whether an element is a member of a set. It allows for queries
+//! to return: "possibly in set" or "definitely not in set". Elements can be
+//! added to the set, but not removed; the more elements that are added to the
+//! set, the larger the probability of false positives. It has been shown that
+//! fewer than 10 bits per element are required for a 1% false positive
+//! probability, independent of the size or number of elements in the set.
+//!
+//! The provided implementation allows you to create a Bloom filter specifying
+//! the approximate number of items expected to inserted and an optional false
+//! positive probability. It also allows you to approximate the total number of
+//! items in the filter.
+//!
+//! # Enhanced Double Hashing
+//!
+//! Enhanced double hashing is used to set bit positions within a bit vector.
+//! The choice for double hashing was shown to be effective without any loss in
+//! the asymptotic false positive probability, leading to less computation and
+//! potentially less need for randomness in practice by Adam Kirsch and
+//! Michael Mitzenmacher in
+//! [Less Hashing, Same Performance: Building a Better Bloom Filter](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.152.579&rep=rep1&type=pdf).
+//!
+//! The enhanced double hash takes the form of the following formula:
+//!
+//! g<sub>i</sub>(x) = (H<sub>1</sub>(x) + iH<sub>2</sub>(x) + f(i)) mod m, where
+//!
+//! H<sub>1</sub>
+//! is Murmur3 128-bit, H<sub>2</sub> is xxHash 64-bit, and f(i) = i<sup>3</sup>
+//!
+//!
+//! # Example
+//!
+//! ```rust
+//! use rsbloom::BloomFilter;
+//!
+//! fn main() {
+//!   let approx_items = 100;
+//!   let mut bf = BloomFilter::new(approx_items);
+//!
+//!   bf.set(&"foo");
+//!   bf.set(&"bar");
+//!
+//!   bf.has(&"foo"); // true
+//!   bf.has(&"bar"); // true
+//!   bf.has(&"baz"); // false
+//!
+//!   bf.num_items_approx(); // 2
+//! }
+//! ```
+#![crate_type = "lib"]
+#![crate_name = "rsbloom"]
+#![warn(missing_docs)]
+
+// import library modules
+pub mod bloom;
+
+// re-export library modules
+pub use self::bloom::BloomFilter;