-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbloom filter.R
109 lines (79 loc) · 2.45 KB
/
bloom filter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#Bloom Filter
# Install the required packages only when they are missing, so that re-running
# the script does not hit the network or re-install on every execution.
for (pkg in c("digest", "bit")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(digest)
library(bit)

# Usernames that are inserted into the filter (one per line, tab-separated
# file with a single column).
spam <- read.table(
  "listed_username_30.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)
colnames(spam) <- "usernames"
str(spam)

# Bloom filter sizing:
#   n - number of inserted elements
#   p - target false-positive probability
#   m - number of bits in the filter
#   k - optimal number of hash functions for that m and n
# NOTE: k is computed for reference only; the hashing code further down uses
# four fixed digest algorithms.
n <- nrow(spam)
p <- 0.07
m <- ceiling(-n * log(p) / (log(2)^2))
k <- ceiling(m / n * log(2))
# Convert a hexadecimal string to a number.
#
# Arguments:
#   h   - a single string of hex digits (case-insensitive, no "0x" prefix).
#         Only the first element is used if a longer vector is passed.
#   mod - optional positive modulus (at most 2^48). When supplied, the value
#         of `h` modulo `mod` is returned, computed exactly digit by digit.
#
# Returns:
#   A double. Without `mod` this is the value of `h`; it is exact only up to
#   2^53 (13 hex digits), so longer inputs such as 64-bit digests are rounded.
#   Pass `mod` to obtain an exact remainder for such inputs.
#   NA is returned if `h` contains a character that is not a hex digit, and 0
#   for the empty string.
hex_to_int <- function(h, mod = NULL) {
  hex_digits <- c(as.character(0:9), letters[1:6])
  digits <- match(strsplit(tolower(h), "")[[1L]], hex_digits) - 1L

  if (is.null(mod)) {
    # Positional expansion: most significant digit first.
    return(sum(digits * 16^(rev(seq_along(digits) - 1))))
  }

  # With mod <= 2^48 the intermediate acc * 16 + d stays below 2^53, so every
  # step of the reduction is exact in double precision.
  stopifnot(is.numeric(mod), length(mod) == 1L, mod >= 1, mod <= 2^48)
  if (anyNA(digits)) {
    return(NA_real_)
  }
  acc <- 0
  for (d in digits) {
    acc <- (acc * 16 + d) %% mod
  }
  acc
}
# ---- Build the filter from the spam list, then probe it with the stream ----

# Digest algorithms acting as the filter's hash functions; each one
# contributes one bit position per username.
hash_algos <- c("murmur32", "xxhash32", "crc32", "xxhash64")

# Map every element of `values` to one 1-based bit position per algorithm.
#
# Arguments:
#   values - vector of items to hash (here: usernames)
#   size   - number of bits in the filter; positions fall in 1..size
#   algos  - digest algorithm names passed to digest()
#
# Returns a numeric matrix with one row per value and one column per
# algorithm.
#
# NOTE(review): hex_to_int() works in double precision, which is exact only up
# to 2^53. The 16-digit xxhash64 digests exceed that, so their position is
# derived from a rounded value (R may warn about loss of accuracy in `%%`).
bloom_positions <- function(values, size, algos = hash_algos) {
  positions <- matrix(NA_real_, nrow = length(values), ncol = length(algos))
  for (j in seq_along(algos)) {
    positions[, j] <- vapply(
      values,
      function(value) {
        hex <- digest(value, algo = algos[j], serialize = TRUE)
        (hex_to_int(hex) %% size) + 1
      },
      numeric(1),
      USE.NAMES = FALSE
    )
  }
  positions
}

# Insert every spam username: switch on all of its bit positions.
bit_vector <- bit(m)
spam_positions <- bloom_positions(spam$usernames, m)
bit_vector[as.vector(spam_positions)] <- TRUE

# Usernames to test against the filter.
stream <- read.table(
  "listed_username_365.txt",
  stringsAsFactors = FALSE,
  sep = "\t"
)
summary(stream)
colnames(stream) <- "usernames"
str(stream)

# A username is reported as present only when all of its positions are set.
stream_positions <- bloom_positions(stream$usernames, m)
bits_set <- matrix(
  as.logical(bit_vector[as.vector(stream_positions)]),
  nrow = nrow(stream_positions),
  ncol = ncol(stream_positions)
)
flagged <- rowSums(bits_set) == ncol(bits_set)

# Every hit is counted as a false positive and every miss as a true negative.
# NOTE(review): this assumes no stream username is actually in the spam list;
# confirm against the data.
fp <- sum(flagged)
tn <- sum(!flagged)

# Observed false-positive rate, in percent.
fp / (fp + tn) * 100