-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Aleksandr Bezobchuk
committed
Dec 30, 2018
0 parents
commit b69335e
Showing
5 changed files
with
339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
/target | ||
**/*.rs.bk | ||
Cargo.lock |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
[package] | ||
name = "rsbloom" | ||
version = "0.1.0" | ||
authors = ["Aleksandr Bezobchuk <aleks.bezobchuk@gmail.com>"] | ||
description = "A simple bloom filter implementation in Rust" | ||
homepage = "https://github.com/alexanderbez/rsbloom" | ||
documentation = "https://docs.rs/rsbloom/" | ||
edition = "2018" | ||
readme = "README.md" | ||
keywords = ["bloom", "filter", "bloomfilter", "bloomfilters"] | ||
license = "MIT" | ||
|
||
[lib] | ||
name = "rsbloom" | ||
|
||
[dependencies] | ||
bit-vec = "0.5.0" | ||
fasthash = "0.3" | ||
rand = "0.6.1" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
//! A simple example showing the use of a Bloom filter. | ||
use rsbloom::BloomFilter; | ||
|
||
fn main() { | ||
let approx_items = 100; | ||
let mut bf = BloomFilter::new(approx_items); | ||
|
||
bf.set(&"foo"); | ||
bf.set(&"bar"); | ||
|
||
bf.has(&"foo"); // true | ||
bf.has(&"bar"); // true | ||
bf.has(&"baz"); // false | ||
|
||
bf.num_items_approx(); // 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
// The MIT License | ||
// Copyright (c) 2018 Aleksandr Bezobchuk | ||
// | ||
// Permission is hereby granted, free of charge, to any person obtaining a copy | ||
// of this software and associated documentation files (the "Software"), to deal | ||
// in the Software without restriction, including without limitation the rights | ||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
// copies of the Software, and to permit persons to whom the Software is | ||
// furnished to do so, subject to the following conditions: | ||
// The above copyright notice and this permission notice shall be included in | ||
// all copies or substantial portions of the Software. | ||
// | ||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
// THE SOFTWARE. | ||
|
||
//! A simple and intuitive implementation of a Bloom filter using enhanced | ||
//! double hashing. | ||
use bit_vec::BitVec; | ||
use fasthash::RandomState; | ||
use fasthash::{murmur3, xx}; | ||
use std::hash::{BuildHasher, Hash, Hasher}; | ||
|
||
const LN_SQR: f64 = core::f64::consts::LN_2 * core::f64::consts::LN_2; | ||
const SET_BIT: bool = true; | ||
const UNSET_BIT: bool = false; | ||
|
||
/// The default false positive probability value which is 1%. | ||
pub const DEFAULT_FALSE_POS: f64 = 0.01; | ||
|
||
/// A Bloom filter implementation that tracks the total number of set bits along | ||
/// with the underlying bit vector and hashing functions, Murmur3 and xxHash. | ||
pub struct BloomFilter<R: BuildHasher, S: BuildHasher> { | ||
bit_vec: BitVec, | ||
num_hashes: u64, | ||
set_bits: u64, | ||
murmur_hasher: R, | ||
xx_hasher: S, | ||
} | ||
|
||
impl BloomFilter<RandomState<murmur3::Murmur3_x64_128>, RandomState<xx::XXHash64>> { | ||
/// Return a new Bloom filter with a given number of approximate items to set. | ||
/// The default false positive probability is set and defined by DEFAULT_FALSE_POS. | ||
pub fn new(approx_items: u64) -> Self { | ||
BloomFilter::new_with_rate(approx_items, DEFAULT_FALSE_POS) | ||
} | ||
|
||
/// Return a new Bloom filter with a given number of approximate items to set | ||
/// and a desired false positive probability. | ||
pub fn new_with_rate(approx_items: u64, fp_prob: f64) -> Self { | ||
let num_bits = optimal_num_bits(approx_items, fp_prob); | ||
let num_hashes = optimal_num_hashes(num_bits, approx_items); | ||
|
||
BloomFilter { | ||
bit_vec: BitVec::from_elem(num_bits as usize, UNSET_BIT), | ||
num_hashes: num_hashes, | ||
set_bits: 0, | ||
murmur_hasher: RandomState::<murmur3::Murmur3_x64_128>::new(), | ||
xx_hasher: RandomState::<xx::XXHash64>::new(), | ||
} | ||
} | ||
} | ||
|
||
impl<R, S> BloomFilter<R, S> | ||
where | ||
R: BuildHasher, | ||
S: BuildHasher, | ||
{ | ||
/// Set an object in the Bloom filter. This operation is idempotent in regards | ||
/// to each unique object. Each object must implement the Hash trait. | ||
pub fn set<T: Hash>(&mut self, obj: &T) { | ||
let mut hasher_one = self.murmur_hasher.build_hasher(); | ||
let mut hasher_two = self.xx_hasher.build_hasher(); | ||
|
||
obj.hash(&mut hasher_one); | ||
obj.hash(&mut hasher_two); | ||
|
||
let h1 = hasher_one.finish(); | ||
let h2 = hasher_two.finish(); | ||
|
||
for i in 0..self.num_hashes { | ||
let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize; | ||
|
||
// Unwrap option<bool> and if the bit is unset then we increment the set | ||
// bits. | ||
// | ||
// NOTE: We should not panic here as enhanced_double_hash ensures the | ||
// index is within bounds via modulo bit vector table size. | ||
if self.bit_vec.get(bit_idx).unwrap() == UNSET_BIT { | ||
self.set_bits += 1; | ||
} | ||
|
||
self.bit_vec.set(bit_idx, SET_BIT); | ||
} | ||
} | ||
|
||
/// Returns a bool reflecting if a given object is 'most likely' in the Bloom | ||
/// filter or not. There is a possibility for a false positive with the | ||
/// probability being under the Bloom filter's p value, but a false negative | ||
/// will never occur. | ||
pub fn has<T: Hash>(&self, obj: &T) -> Option<bool> { | ||
let mut hasher_one = self.murmur_hasher.build_hasher(); | ||
let mut hasher_two = self.xx_hasher.build_hasher(); | ||
|
||
obj.hash(&mut hasher_one); | ||
obj.hash(&mut hasher_two); | ||
|
||
let h1 = hasher_one.finish(); | ||
let h2 = hasher_two.finish(); | ||
|
||
for i in 0..self.num_hashes { | ||
let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize; | ||
|
||
// Unwrap option<bool> and if the bit is not set, then we short-circuit | ||
// and return false. | ||
// | ||
// NOTE: We should not panic here as enhanced_double_hash ensures the | ||
// index is within bounds via modulo bit vector table size. | ||
if self.bit_vec.get(bit_idx).unwrap() != SET_BIT { | ||
return Some(false); | ||
} | ||
} | ||
|
||
Some(true) | ||
} | ||
|
||
/// Returns the approximate total number of objects set in the Bloom filter. | ||
pub fn num_items_approx(&self) -> u64 { | ||
let m = self.bit_vec.len() as f64; | ||
let k = self.num_hashes as f64; | ||
let x = self.set_bits as f64; | ||
(-(m / k) * (1.0 - (x / m)).ln()) as u64 | ||
} | ||
|
||
fn enhanced_double_hash(&self, h1: u64, h2: u64, i: u64) -> u64 { | ||
let r = h1.wrapping_add(i.wrapping_mul(h2)).wrapping_add(i.pow(3)); | ||
r % self.bit_vec.len() as u64 | ||
} | ||
} | ||
|
||
/// Return the optimal bit vector size for a Bloom filter given an approximate | ||
/// size approx_items and a desired false positive probability fp_prob. | ||
fn optimal_num_bits(approx_items: u64, fp_prob: f64) -> u64 { | ||
(-((fp_prob.ln() * (approx_items as f64)) / LN_SQR)).ceil() as u64 | ||
} | ||
|
||
/// Return the optimal number of hash 'functions' for a Bloom filter given a
/// bit vector size num_bits and an approximate set size approx_items.
///
/// The value is `(m / n) * ln 2` rounded up, where `m` is `num_bits` and `n`
/// is `approx_items`. The ratio is computed in floating point so that the
/// fractional part of the bits-per-item figure is not lost before scaling.
///
/// Degenerate inputs do not panic: with no bits there is nothing to probe and
/// 0 is returned; with bits but no expected items a single probe is returned.
fn optimal_num_hashes(num_bits: u64, approx_items: u64) -> u64 {
    if num_bits == 0 {
        return 0;
    }
    if approx_items == 0 {
        return 1;
    }

    let bits_per_item = num_bits as f64 / approx_items as f64;
    (bits_per_item * std::f64::consts::LN_2).ceil() as u64
}
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::distributions::Alphanumeric;
    use rand::{thread_rng, Rng};
    use std::collections::HashSet;

    /// Generate a random alphanumeric string of `len` characters.
    fn random_str(len: usize) -> String {
        thread_rng().sample_iter(&Alphanumeric).take(len).collect()
    }

    #[test]
    fn test_bloom_filter() {
        let n = 1000;

        // Generate random strings to insert; the set drops any duplicates.
        let items: HashSet<String> = (0..n).map(|_| random_str(30)).collect();

        let mut bf = BloomFilter::new(items.len() as u64);

        // An item must be reported as present right after it has been set.
        for item in &items {
            bf.set(item);

            assert!(
                bf.has(item).unwrap(),
                "item {} should result in a positive inclusion",
                item,
            );
        }

        // Once everything is inserted, no earlier item may have turned into a
        // false negative. Probing fresh random strings cannot check this, as
        // they are practically never members of the inserted set.
        for item in &items {
            assert!(
                bf.has(item).unwrap(),
                "item {} resulted in a false negative",
                item,
            );
        }
    }

    #[test]
    fn test_optimal_num_bits() {
        assert_eq!(optimal_num_bits(10, 0.04), 67);
        assert_eq!(optimal_num_bits(5000, 0.01), 47926);
        assert_eq!(optimal_num_bits(100000, 0.01), 958506);
    }

    #[test]
    fn test_optimal_num_hashes() {
        assert_eq!(optimal_num_hashes(67, 10), 5);
        assert_eq!(optimal_num_hashes(47926, 5000), 7);
        assert_eq!(optimal_num_hashes(958506, 100000), 7);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
// The MIT License | ||
// Copyright (c) 2018 Aleksandr Bezobchuk | ||
// | ||
// Permission is hereby granted, free of charge, to any person obtaining a copy | ||
// of this software and associated documentation files (the "Software"), to deal | ||
// in the Software without restriction, including without limitation the rights | ||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
// copies of the Software, and to permit persons to whom the Software is | ||
// furnished to do so, subject to the following conditions: | ||
// The above copyright notice and this permission notice shall be included in | ||
// all copies or substantial portions of the Software. | ||
// | ||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
// THE SOFTWARE. | ||
|
||
//! A simple implementation of a Bloom filter, a space-efficient probabilistic | ||
//! data structure. | ||
//! | ||
//! # Bloom Filters | ||
//! | ||
//! A Bloom filter is a space-efficient probabilistic data structure that is | ||
//! used to test whether an element is a member of a set. It allows for queries | ||
//! to return: "possibly in set" or "definitely not in set". Elements can be | ||
//! added to the set, but not removed; the more elements that are added to the | ||
//! set, the larger the probability of false positives. It has been shown that | ||
//! fewer than 10 bits per element are required for a 1% false positive | ||
//! probability, independent of the size or number of elements in the set. | ||
//! | ||
//! The provided implementation allows you to create a Bloom filter specifying | ||
//! the approximate number of items expected to be inserted and an optional false | ||
//! positive probability. It also allows you to approximate the total number of | ||
//! items in the filter. | ||
//! | ||
//! # Enhanced Double Hashing | ||
//! | ||
//! Enhanced double hashing is used to set bit positions within a bit vector. | ||
//! Adam Kirsch and Michael Mitzenmacher showed in | ||
//! [Less Hashing, Same Performance: Building a Better Bloom Filter](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.152.579&rep=rep1&type=pdf) | ||
//! that double hashing is effective without any loss in the asymptotic false | ||
//! positive probability, leading to less computation and potentially less | ||
//! need for randomness in practice. | ||
//! | ||
//! The enhanced double hash takes the form of the following formula: | ||
//! | ||
//! g<sub>i</sub>(x) = (H<sub>1</sub>(x) + iH<sub>2</sub>(x) + f(i)) mod m, where | ||
//! | ||
//! H<sub>1</sub> | ||
//! is Murmur3 128-bit, H<sub>2</sub> is xxHash 64-bit, and f(i) = i<sup>3</sup> | ||
//! | ||
//! | ||
//! # Example | ||
//! | ||
//! ```rust | ||
//! use rsbloom::BloomFilter; | ||
//! | ||
//! fn main() { | ||
//! let approx_items = 100; | ||
//! let mut bf = BloomFilter::new(approx_items); | ||
//! | ||
//! bf.set(&"foo"); | ||
//! bf.set(&"bar"); | ||
//! | ||
//! bf.has(&"foo"); // true | ||
//! bf.has(&"bar"); // true | ||
//! bf.has(&"baz"); // false | ||
//! | ||
//! bf.num_items_approx(); // 2 | ||
//! } | ||
//! ``` | ||
#![crate_type = "lib"] | ||
#![crate_name = "rsbloom"] | ||
#![warn(missing_docs)] | ||
|
||
// import library modules | ||
pub mod bloom; | ||
|
||
// re-export library modules | ||
pub use self::bloom::BloomFilter; |