Skip to content

Commit 3e8bcee

Browse files
committed
Add BloomFilter64 to support large Bloom filters
Add BloomFilter64 that is like BloomFilter, but uses ulong to represent m and count, and uses Buckets64 and Hash128. Add Buckets64 that is like Buckets, but uses ulong to represent count and uses multiple arrays to overcome the C# array size limit of approximately 2^31 elements and the default .NET limit of 2^31 bytes per object. Add Hash128 that is like Hash, but provides 128 bits of hash material. Also, add Hash128.ComputeHashAndSum() to improve performance by avoiding the conversion of bytes to string, then string to bytes. Add Utils.OptimalM64() that takes a ulong n and calculates a ulong m. Add Utils.HashKernel128() and Utils.HashKernel128ReturnValue for working with 128 bits of hash material. In Utils.HashKernel128(), avoid the use of Skip() and Take() to improve performance. Add TestBloomFilter64.cs, TestBuckets64.cs, TestHash128.cs to test the new classes.
1 parent edb3da6 commit 3e8bcee

File tree

9 files changed

+1058
-0
lines changed

9 files changed

+1058
-0
lines changed
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
using ProbabilisticDataStructures;
7+
using System.Security.Cryptography;
8+
9+
namespace ProbabilisticDataStructures
10+
{
11+
/// <summary>
/// BloomFilter64 implements a classic Bloom filter whose size (m) and item count are
/// tracked as 64-bit values and whose bits are stored in a <see cref="Buckets64"/>.
/// A Bloom filter has a non-zero probability of false positives and a zero
/// probability of false negatives.
/// </summary>
public class BloomFilter64 : IFilter
{
    /// <summary>
    /// Filter data: m buckets of one bit each.
    /// </summary>
    internal Buckets64 Buckets { get; set; }
    /// <summary>
    /// Hash algorithm used to derive the bucket indexes for an item.
    /// </summary>
    private HashAlgorithm Hash { get; set; }
    /// <summary>
    /// Filter size (number of one-bit buckets).
    /// </summary>
    private ulong m { get; set; }
    /// <summary>
    /// Number of hash functions (buckets touched per item).
    /// </summary>
    private uint k { get; set; }
    /// <summary>
    /// Number of items added since construction or the last reset.
    /// </summary>
    private ulong count { get; set; }

    /// <summary>
    /// Creates a new Bloom filter optimized to store n items with a specified target
    /// false-positive rate. The filter size and the number of hash functions are
    /// obtained from Utils.OptimalM64 and Utils.OptimalK respectively.
    /// </summary>
    /// <param name="n">Number of items to store.</param>
    /// <param name="fpRate">Desired false positive rate.</param>
    public BloomFilter64(ulong n, double fpRate)
    {
        this.m = Utils.OptimalM64(n, fpRate);
        this.k = Utils.OptimalK(fpRate);
        this.Buckets = new Buckets64(this.m, 1);
        this.Hash = Defaults.GetDefaultHashAlgorithm();
    }

    /// <summary>
    /// Returns the Bloom filter capacity, m.
    /// </summary>
    /// <returns>The Bloom filter capacity, m.</returns>
    public ulong Capacity()
    {
        return this.m;
    }

    /// <summary>
    /// Returns the number of hash functions.
    /// </summary>
    /// <returns>The number of hash functions.</returns>
    public uint K()
    {
        return this.k;
    }

    /// <summary>
    /// Returns the number of items added to the filter. Every call to Add or
    /// TestAndAdd counts as one item, whether or not the item was already present.
    /// </summary>
    /// <returns>The number of items added.</returns>
    public ulong Count()
    {
        return this.count;
    }

    /// <summary>
    /// Returns the current estimated ratio of set bits, computed from the item count
    /// as 1 - e^(-count * k / m) without inspecting the buckets.
    /// </summary>
    /// <returns>The current estimated ratio of set bits.</returns>
    public double EstimatedFillRatio()
    {
        return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m);
    }

    /// <summary>
    /// Returns the ratio of set bits. This reads every bucket, so the cost grows
    /// linearly with the filter size.
    /// </summary>
    /// <returns>The ratio of set bits.</returns>
    public double FillRatio()
    {
        ulong sum = 0;
        for (ulong i = 0; i < this.Buckets.count; i++)
        {
            sum += this.Buckets.Get(i);
        }
        return (double)sum / (double)this.m;
    }

    /// <summary>
    /// Will test for membership of the data and returns true if it is a member,
    /// false if not. This is a probabilistic test, meaning there is a non-zero
    /// probability of false positives but a zero probability of false negatives.
    /// </summary>
    /// <param name="data">The data to search for.</param>
    /// <returns>Whether or not the data is maybe contained in the filter.</returns>
    public bool Test(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // If any of the K bits are not set, then it's not a member.
        for (uint i = 0; i < this.k; i++)
        {
            // The i-th index combines the two base hashes; wraparound of the sum is
            // intended, so keep it unchecked even when the build enables overflow checks.
            var idx = unchecked((lower + upper * i) % this.m);
            if (this.Buckets.Get(idx) == 0)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Will add the data to the Bloom filter. It returns the filter to allow
    /// for chaining.
    /// </summary>
    /// <param name="data">The data to add.</param>
    /// <returns>The filter.</returns>
    public IFilter Add(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // Set the K bits.
        for (uint i = 0; i < this.k; i++)
        {
            // Wraparound of the sum is intended; see Test.
            var idx = unchecked((lower + upper * i) % this.m);
            this.Buckets.Set(idx, 1);
        }

        this.count++;
        return this;
    }

    /// <summary>
    /// Is equivalent to calling Test followed by Add, but hashes the data only once.
    /// It returns true if the data was a member before the call, false if not.
    /// </summary>
    /// <param name="data">The data to test for and add if it doesn't exist.</param>
    /// <returns>Whether or not the data was probably contained in the filter.</returns>
    public bool TestAndAdd(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;
        var member = true;

        // If any of the K bits are not set, then it's not a member. Keep going after
        // the first miss, because all K bits still have to be set for the add.
        for (uint i = 0; i < this.k; i++)
        {
            // Wraparound of the sum is intended; see Test.
            var idx = unchecked((lower + upper * i) % this.m);
            if (this.Buckets.Get(idx) == 0)
            {
                member = false;
            }
            this.Buckets.Set(idx, 1);
        }

        this.count++;
        return member;
    }

    /// <summary>
    /// Restores the Bloom filter to its original state: all bits are cleared and the
    /// item count returns to zero. It returns the filter to allow for chaining.
    /// </summary>
    /// <returns>The reset bloom filter.</returns>
    public BloomFilter64 Reset()
    {
        this.Buckets.Reset();
        // The count feeds Count() and EstimatedFillRatio(); it must not survive a reset.
        this.count = 0;
        return this;
    }

    /// <summary>
    /// Sets the hashing function used in the filter. Items added under a previous
    /// hash algorithm map to different buckets, so this is normally called before
    /// any data is added.
    /// </summary>
    /// <param name="h">The HashAlgorithm to use.</param>
    /// <exception cref="ArgumentNullException">Thrown when h is null.</exception>
    // TODO: Add SetHash to the IFilter interface?
    public void SetHash(HashAlgorithm h)
    {
        if (h == null)
        {
            throw new ArgumentNullException("h");
        }
        this.Hash = h;
    }
}
199+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
7+
namespace ProbabilisticDataStructures
8+
{
9+
/// <summary>
/// Buckets64 is a fast, space-efficient array of buckets where each bucket can store
/// up to a configured maximum value. Buckets are bit-packed into several byte arrays
/// so that the total size is not bounded by the length limit of a single C# array.
/// </summary>
public class Buckets64
{
    // Maximum length, in bytes, of each backing array. A power of two is used so a
    // global byte index splits into (array, index-in-array) by division and modulo.
    private const uint maxArraySize = 1U << 30;
    // Backing storage; global byte b lives at Data[b / maxArraySize][b % maxArraySize].
    private byte[][] Data { get; set; }
    // Number of arrays in Data.
    private int arrayCount { get; set; }
    // Number of bits per bucket.
    private byte bucketSize { get; set; }
    private byte _max;
    // Maximum value a bucket may hold. Assignments are clamped to the range of a byte.
    private int Max
    {
        get
        {
            return _max;
        }
        set
        {
            // TODO: Figure out this truncation thing.
            // I'm not sure if MaxValue is always supposed to be capped at 255 via
            // a byte conversion or not...
            if (value > byte.MaxValue)
            {
                _max = byte.MaxValue;
            }
            else if (value < 0)
            {
                _max = 0;
            }
            else
            {
                _max = (byte)value;
            }
        }
    }
    // Number of buckets.
    internal ulong count { get; set; }

    /// <summary>
    /// Creates a new Buckets64 with the provided number of buckets where each bucket
    /// is the specified number of bits. The maximum bucket value is 2^bucketSize - 1,
    /// capped at 255.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    internal Buckets64(ulong count, byte bucketSize)
    {
        this.count = count;
        this.bucketSize = bucketSize;
        AllocateArray(count, bucketSize);
        // C# masks an int shift count to 5 bits, so (1 << bucketSize) wraps for
        // bucketSize >= 32. Any size of 8 bits or more hits the 255 cap anyway.
        this.Max = bucketSize >= 8 ? byte.MaxValue : (1 << bucketSize) - 1;
    }

    /// <summary>
    /// Allocates zero-filled backing arrays large enough for count buckets of
    /// bucketSize bits each. Every array except possibly the last is maxArraySize
    /// bytes long.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    private void AllocateArray(ulong count, byte bucketSize)
    {
        ulong bytesToAllocate = (count * bucketSize + 7) / 8;
        // Size the outer array from the byte total (not the bucket count) so that it
        // is correct for any bucket width. Always keep at least one array.
        ulong arraysNeeded = (bytesToAllocate + maxArraySize - 1) / maxArraySize;
        this.arrayCount = (int)Math.Max(arraysNeeded, 1UL);
        this.Data = new byte[this.arrayCount][];
        for (int i = 0; i < this.arrayCount; i++)
        {
            ulong arraySize = Math.Min(bytesToAllocate, (ulong)maxArraySize);
            this.Data[i] = new byte[arraySize];
            bytesToAllocate -= arraySize;
        }
    }

    /// <summary>
    /// Returns the maximum value that can be stored in a bucket.
    /// </summary>
    /// <returns>The bucket max value.</returns>
    internal byte MaxBucketValue()
    {
        return this._max;
    }

    /// <summary>
    /// Increment the value in the specified bucket by the provided delta. A bucket
    /// can be decremented by providing a negative delta.
    /// <para>
    /// The value is clamped to zero and the maximum bucket value. Returns itself
    /// to allow for chaining.
    /// </para>
    /// </summary>
    /// <param name="bucket">The bucket to increment.</param>
    /// <param name="delta">The amount to increment the bucket by.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Increment(ulong bucket, int delta)
    {
        // Compute the bit offset in 64-bit arithmetic, like Set and Get do; a 32-bit
        // product would wrap for offsets at or beyond 2^32 and hit the wrong bucket.
        ulong offset = bucket * this.bucketSize;
        long val = (long)GetBits(offset, this.bucketSize) + delta;

        if (val > this.Max)
        {
            val = this.Max;
        }
        else if (val < 0)
        {
            val = 0;
        }

        SetBits(offset, this.bucketSize, (uint)val);
        return this;
    }

    /// <summary>
    /// Set the bucket value. The value is clamped to zero and the maximum bucket
    /// value. Returns itself to allow for chaining.
    /// </summary>
    /// <param name="bucket">The bucket to change the value of.</param>
    /// <param name="value">The value to set.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Set(ulong bucket, byte value)
    {
        if (value > this._max)
        {
            value = this._max;
        }

        SetBits(bucket * this.bucketSize, this.bucketSize, value);
        return this;
    }

    /// <summary>
    /// Returns the value in the specified bucket.
    /// </summary>
    /// <param name="bucket">The bucket to get.</param>
    /// <returns>The specified bucket.</returns>
    internal uint Get(ulong bucket)
    {
        return GetBits(bucket * this.bucketSize, this.bucketSize);
    }

    /// <summary>
    /// Restores the Buckets64 to the original state by zeroing every bucket. Returns
    /// itself to allow for chaining.
    /// </summary>
    /// <returns>The Buckets64 object the reset operation was performed on.</returns>
    internal Buckets64 Reset()
    {
        // Clear in place rather than reallocating: the backing arrays can be very
        // large, and their sizes never change after construction.
        foreach (var array in this.Data)
        {
            Array.Clear(array, 0, array.Length);
        }
        return this;
    }

    /// <summary>
    /// Returns the bits at the specified offset and length. Bits are numbered from
    /// the least significant bit of each byte; a read that crosses a byte boundary
    /// is split and the pieces are recombined with the lower-offset bits lowest.
    /// </summary>
    /// <param name="offset">The position to start reading at.</param>
    /// <param name="length">The distance to read from the offset.</param>
    /// <returns>The bits at the specified offset and length.</returns>
    internal uint GetBits(ulong offset, int length)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        if ((byteOffset + length) > 8)
        {
            // Read the remainder of this byte, then continue in the next byte(s).
            int rem = 8 - byteOffset;
            return GetBits(offset, rem)
                | (GetBits(offset + (ulong)rem, length - rem) << rem);
        }

        var dataArray = this.Data[byteIndex / maxArraySize];
        var dataArrayByteIndex = byteIndex % maxArraySize;
        int bitMask = (1 << length) - 1;
        return (uint)((dataArray[dataArrayByteIndex] & (bitMask << byteOffset)) >> byteOffset);
    }

    /// <summary>
    /// Sets bits at the specified offset and length. Only the low length bits of
    /// bits are written; a write that crosses a byte boundary is split across the
    /// affected bytes.
    /// </summary>
    /// <param name="offset">The position to start writing at.</param>
    /// <param name="length">The distance to write from the offset.</param>
    /// <param name="bits">The bits to write.</param>
    internal void SetBits(ulong offset, int length, uint bits)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        if ((byteOffset + length) > 8)
        {
            // Write what fits in this byte, then the remaining high bits further on.
            int rem = 8 - byteOffset;
            SetBits(offset, rem, bits);
            SetBits(offset + (ulong)rem, length - rem, bits >> rem);
            return;
        }

        var dataArray = this.Data[byteIndex / maxArraySize];
        var dataArrayByteIndex = byteIndex % maxArraySize;
        int bitMask = (1 << length) - 1;
        // Clear the target bit range, then OR in the new value.
        dataArray[dataArrayByteIndex] =
            (byte)((dataArray[dataArrayByteIndex]) & ~(bitMask << byteOffset));
        dataArray[dataArrayByteIndex] =
            (byte)((dataArray[dataArrayByteIndex]) | ((bits & bitMask) << byteOffset));
    }
}
189+
}

0 commit comments

Comments
 (0)