Skip to content

Commit efa045f

Browse files
committed
Update frequent itemset mining
1 parent 1a645b6 commit efa045f

File tree

8 files changed

+246
-64
lines changed

8 files changed

+246
-64
lines changed

Frequent Itemset Mining/apriori.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
from datetime import datetime
2+
13
class Apriori():
24

35
def __init__(self, dataset):
    """Store the transaction dataset and initialise mining state.

    Args:
        dataset: iterable of transactions (each a collection of items).
    """
    self.dataset = dataset
    # itemset -> support (fraction of transactions); filled during mining
    self.support_data = {}
    # list of (itemset, support) tuples; filled by generate_L
    self.freq_itemsets = []
    # total transaction count, cached as float for support division
    self.t_num = float(len(self.dataset))
810

911

1012
def __create_C1(self):
@@ -88,15 +90,16 @@ def __generate_Lk_by_Ck(self, Ck, min_sup):
8890
item_count[item] = 1
8991
else:
9092
item_count[item] += 1
91-
t_num = float(len(self.dataset))
93+
9294
for item in item_count:
93-
if (item_count[item] / t_num) >= min_sup:
95+
if (item_count[item] / self.t_num) >= min_sup:
9496
Lk.add(item)
95-
self.support_data[item] = item_count[item] / t_num
97+
self.support_data[item] = item_count[item] / self.t_num
98+
9699
return Lk
97100

98101

99-
def generate_L(self, min_sup):
    """Generate all frequent itemsets with the Apriori algorithm.

    Grows candidate itemsets level by level until no k-itemset is
    frequent.  Candidate generation (Ck) and frequent-set filtering (Lk)
    are timed separately and the accumulated wall-clock times printed.

    Args:
        min_sup: minimal support threshold (fraction of transactions).

    Returns:
        self.freq_itemsets: list of (itemset, support) tuples for every
        frequent itemset found, in increasing itemset size.
    """
    start = datetime.now()
    C1 = self.__create_C1()
    # total_seconds() gives the full duration; the previous
    # `seconds + microseconds / 1000000` arithmetic silently dropped
    # the timedelta's `days` component.
    create_Ck_time = (datetime.now() - start).total_seconds()

    start = datetime.now()
    L1 = self.__generate_Lk_by_Ck(C1, min_sup)
    generate_Lk_time = (datetime.now() - start).total_seconds()

    Lksub1 = L1.copy()
    for lk_i in Lksub1:
        self.freq_itemsets.append((lk_i, self.support_data[lk_i]))

    i = 2
    # Keep growing itemset size until no frequent k-itemset remains.
    while True:
        start = datetime.now()
        Ci = self.__create_Ck(Lksub1, i)
        create_Ck_time += (datetime.now() - start).total_seconds()

        start = datetime.now()
        Li = self.__generate_Lk_by_Ck(Ci, min_sup)
        generate_Lk_time += (datetime.now() - start).total_seconds()

        Lksub1 = Li.copy()
        if len(Lksub1) == 0:
            break
        for lk_i in Lksub1:
            self.freq_itemsets.append((lk_i, self.support_data[lk_i]))
        i += 1

    print("Create Ck time (s): ", create_Ck_time)
    print("Generate Lk time (s): ", generate_Lk_time)

    return self.freq_itemsets

Frequent Itemset Mining/fpgrowth.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,11 @@ def find_child(self, name):
2020

2121
class FPGrowth():
2222

23-
def __init__(self, dataset, min_sup=0.0):
    """Store the dataset and the mining threshold.

    Args:
        dataset: iterable of transactions (each a collection of items).
        min_sup: minimal support threshold.
    """
    self.dataset = dataset
    self.min_sup = min_sup
    self.freq_L1 = {}  # frequent 1-itemsets
    self.freq_itemsets = []  # stores each frequent itemset with its corresponding count
3028

3129
def __get_frequency(self, trans_records):
3230
rect = {}
@@ -38,7 +36,7 @@ def __get_frequency(self, trans_records):
3836
def build_fptree(self):
3937
if self.dataset is None:
4038
return
41-
# 依据销售数量创建item序列
39+
# 创建item序列
4240
self.freq_L1 = self.__get_frequency(self.dataset)
4341
tmp_list = []
4442
tmp_list.extend(self.freq_L1.keys())
@@ -73,7 +71,7 @@ def __fpgrowth(self, cpb, post_model):
7371
rule.append(header.name)
7472
rule.extend(post_model)
7573
# 表头项+后缀模式 构成一条频繁模式(频繁模式内部也是按照F1排序的),频繁度为表头项的计数
76-
temp = (rule, header.count)
74+
temp = (rule, header.count / data_num)
7775
self.freq_itemsets.append(temp)
7876
# 新的后缀模式:表头项+上一次的后缀模式(注意保持顺序,始终按F1的顺序排列)
7977
new_post_pattern = []
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import os
2+
import numpy as np
3+
4+
class GenData():
    """Wrapper around the external IBM synthetic-data generator.

    Builds command lines for the ``gen lit`` binary and shells out to it,
    writing benchmark transaction datasets under ``./data/``.
    """

    def __init__(self, ntrans, tlen, nitems):
        # path of the generator binary plus its fixed "lit" sub-command
        self.run_apt = "../../IBMGenerator/gen lit "
        self.target_folder = "./data/"
        self.ntrans = ntrans  # number of transactions (presumably thousands — TODO confirm)
        self.tlen = tlen      # average transaction length
        self.nitems = nitems  # number of distinct items (presumably thousands — TODO confirm)

    def _build_command(self, ntrans, tlen, nitems, fname):
        """Return the full generator command line for one dataset file."""
        return (self.run_apt + "-ntrans " + str(ntrans)
                + " -tlen " + str(tlen) + " -nitems " + str(nitems)
                + " -fname " + fname + " -ascii")

    def gen_base_data(self):
        """Generate the base dataset; return the shell exit status."""
        fname = self.target_folder + "base_set"
        return os.system(self._build_command(self.ntrans, self.tlen,
                                             self.nitems, fname))

    def gen_data_by_ntrans(self):
        """Generate one dataset per ntrans value in 1..20 (other params fixed)."""
        sub_folder = self.target_folder + "ntrans/"
        # os.makedirs is portable and does not fail when the directory
        # already exists (the original shelled out to `mkdir`).
        os.makedirs(sub_folder, exist_ok=True)
        for ntrans in range(1, 21, 1):
            fname = sub_folder + str(ntrans)
            print(fname)
            os.system(self._build_command(ntrans, self.tlen, self.nitems, fname))

    def gen_data_by_tlen(self):
        """Generate one dataset per tlen value in 1..20 (other params fixed)."""
        sub_folder = self.target_folder + "tlen/"
        os.makedirs(sub_folder, exist_ok=True)
        for tlen in range(1, 21, 1):
            fname = sub_folder + str(tlen)
            print(fname)
            os.system(self._build_command(self.ntrans, tlen, self.nitems, fname))

    def gen_data_by_nitems(self):
        """Generate one dataset per nitems value in 0.1..2.0 (step 0.1)."""
        sub_folder = self.target_folder + "nitems/"
        os.makedirs(sub_folder, exist_ok=True)
        for nitems in np.arange(0.1, 2.1, 0.1):
            fname = sub_folder + str(nitems)
            print(fname)
            os.system(self._build_command(self.ntrans, self.tlen, nitems, fname))
55+
56+
57+
58+
if __name__ == "__main__":
    # Base configuration: ntrans=5, tlen=10, nitems=1 (see GenData for units).
    gen_data = GenData(ntrans=5, tlen=10, nitems=1)
    gen_data.gen_base_data()
    # Uncomment to regenerate the parameter-sweep datasets:
    # gen_data.gen_data_by_ntrans()
    # gen_data.gen_data_by_tlen()
    # gen_data.gen_data_by_nitems()
18.8 KB
Loading
21.5 KB
Loading
21.9 KB
Loading
20.3 KB
Loading

Frequent Itemset Mining/test.py

Lines changed: 144 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
1+
from datetime import datetime
2+
import os
3+
import matplotlib.pyplot as plt
4+
import numpy as np
5+
16
from apriori import Apriori
27
from fpgrowth import FPGrowth
38

9+
10+
def data_reader(data_file):
    """Load a generator output file as a list of transactions.

    Each line holds three leading metadata fields followed by the items
    of one transaction; the metadata is discarded.
    """
    with open(data_file, 'r') as handle:
        return [record.split()[3:] for record in handle]
16+
17+
418
def load_data_set():
519
"""
620
Load a sample data set (From Data Mining: Concepts and Techniques, 3th Edition)
@@ -12,34 +26,140 @@ def load_data_set():
1226
['l1', 'l3'], ['l1', 'l2', 'l3', 'l5'], ['l1', 'l2', 'l3']]
1327
return data_set
1428

15-
def test_apriori(data_set, min_sup=0.05):
    """Mine data_set with Apriori and return the elapsed wall-clock seconds.

    Args:
        data_set: list of transactions (each a list of items).
        min_sup: minimal support threshold passed through to generate_L.

    Returns:
        float: elapsed time in seconds (construction + mining).
    """
    start = datetime.now()
    apriori = Apriori(data_set)
    apriori.generate_L(min_sup=min_sup)
    # total_seconds() is the correct duration; summing .seconds and
    # .microseconds by hand drops any whole-day component.
    elapsed = (datetime.now() - start).total_seconds()
    print("Apriori over")
    return elapsed
    # print("# of freq itemsets:", len(apriori.freq_itemsets))
    # print(apriori.freq_itemsets)
39+
40+
41+
def test_fpgrowth(data_set, min_sup=0.05):
    """Mine data_set with FP-Growth and return the elapsed wall-clock seconds.

    Args:
        data_set: list of transactions (each a list of items).
        min_sup: minimal support threshold for the FP-tree miner.

    Returns:
        float: elapsed time in seconds (construction + mining).
    """
    start = datetime.now()
    fp = FPGrowth(data_set, min_sup=min_sup)
    fp.build_fptree()
    # total_seconds() is the correct duration; summing .seconds and
    # .microseconds by hand drops any whole-day component.
    elapsed = (datetime.now() - start).total_seconds()
    print("FP-Growth over")
    # print("# of freq itemsets:", len(fp.freq_itemsets))

    return elapsed
50+
51+
52+
def test_ntrans():
    """Benchmark Apriori vs FP-Growth over the ntrans=1..20 datasets and plot."""
    folder = "./data/ntrans/"
    sweep = range(1, 21, 1)
    apriori_times = []
    fpgrowth_times = []

    for ntrans in sweep:
        name = str(ntrans) + ".data"
        print(name)
        transactions = data_reader(folder + name)
        apriori_times.append(test_apriori(transactions))
        fpgrowth_times.append(test_fpgrowth(transactions))

    print(apriori_times)
    print(fpgrowth_times)
    plt.plot(sweep, apriori_times, label="Apriori")
    plt.plot(sweep, fpgrowth_times, label="FP-Growth")
    plt.xlabel("ntrans (k)")
    plt.ylabel("time (s)")
    plt.legend()
    plt.show()
75+
76+
def test_tlen():
    """Benchmark Apriori vs FP-Growth over the tlen=1..20 datasets and plot."""
    folder = "./data/tlen/"
    sweep = range(1, 21, 1)
    apriori_times = []
    fpgrowth_times = []

    for tlen in sweep:
        name = str(tlen) + ".data"
        print(name)
        transactions = data_reader(folder + name)
        apriori_times.append(test_apriori(transactions))
        fpgrowth_times.append(test_fpgrowth(transactions))

    print(apriori_times)
    print(fpgrowth_times)
    plt.plot(sweep, apriori_times, label="Apriori")
    plt.plot(sweep, fpgrowth_times, label="FP-Growth")
    plt.xlabel("tlen")
    plt.ylabel("time (s)")
    plt.legend()
    plt.show()
99+
100+
101+
def test_nitems():
    """Benchmark Apriori vs FP-Growth over the nitems=0.1..2.0 datasets and plot."""
    folder = "./data/nitems/"
    sweep = list(np.arange(0.1, 2.1, 0.1))
    apriori_times = []
    fpgrowth_times = []

    for nitems in sweep:
        name = str(nitems) + ".data"
        print(name)
        transactions = data_reader(folder + name)
        apriori_times.append(test_apriori(transactions))
        fpgrowth_times.append(test_fpgrowth(transactions))

    print(apriori_times)
    print(fpgrowth_times)
    plt.plot(sweep, apriori_times, label="Apriori")
    plt.plot(sweep, fpgrowth_times, label="FP-Growth")
    plt.xlabel("nitems (k)")
    plt.ylabel("time (s)")
    plt.legend()
    plt.show()
124+
125+
126+
def test_minsup():
    """Benchmark both miners on the base dataset while sweeping min_sup 0.01..0.20."""
    data_file = "./data/base_set.data"
    transactions = data_reader(data_file)
    sweep = list(np.arange(0.01, 0.21, 0.01))
    apriori_times = []
    fpgrowth_times = []

    # Run the two miners interleaved so their progress output alternates.
    for threshold in sweep:
        apriori_times.append(test_apriori(transactions, min_sup=threshold))
        fpgrowth_times.append(test_fpgrowth(transactions, min_sup=threshold))

    print(apriori_times)
    print(fpgrowth_times)
    plt.plot(sweep, apriori_times, label="Apriori")
    plt.plot(sweep, fpgrowth_times, label="FP-Growth")
    plt.xlabel("minsup")
    plt.ylabel("time (s)")
    plt.legend()
    plt.show()
145+
146+
def test_base():
    """Run a single FP-Growth timing pass over the base dataset."""
    transactions = data_reader("./data/base_set.data")
    # Alternative inputs / runs, enable as needed:
    # data_set = load_data_set()
    # print("Apriori-----------------------")
    # print("Time (s):", test_apriori(data_set))

    print("FP-Growth-----------------------")
    print("Time (s):", test_fpgrowth(transactions))
35155

36156

37157
if __name__ == "__main__":
    """
    Test
    """
    # Single timing pass over the base dataset.
    test_base()
    # Parameter-sweep benchmarks; enable as needed:
    # test_ntrans()
    # test_tlen()
    # test_nitems()
    # test_minsup()

0 commit comments

Comments
 (0)