-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path高斯混合聚类.py
156 lines (137 loc) · 6.32 KB
/
高斯混合聚类.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from numpy import *
import matplotlib.pyplot as plt
import copy
'''
这个算法对初值太敏感了, 特别是对均值向量mu的初值
'''
# 加载数据, 预处理数据
def loadData(filename):
dataSet = []
fr = open(filename)
for line in fr.readlines():
curLine = line.strip().split(',')
fltLine = list(map(float, curLine)) # 将数据映射为浮点型数据
dataSet.append(fltLine)
return dataSet
# 高斯分布的概率密度函数
def prob(x, mu, sigma):
n = shape(x)[1] # 获取样本点的维数
expOn = float(-0.5 * (x - mu) * (sigma.I) * ((x - mu).T))
divBy = pow(2 * pi, n / 2) * pow(linalg.det(sigma), 0.5)
return pow(e, expOn) / divBy
# EM算法
def EM(dataMat, clusterNum = 3, maxIter=500, valve=1e-5):
'''
:param dataMat: 数据集
:param clusterNum: 簇个数
:param maxIter: 最大迭代次数
:return: 样本点的后验分布
'''
m, n = shape(dataMat) # 获取数据集的样本点个数以及样本点的维度
#alpha = [1 / 3, 1 / 3, 1 / 3] # 初始化各高斯混合成分参数
alpha = [] # 记录各高斯混合成分参数
#mu = [dataMat[:int(m/3), :], dataMat[int(m/3):int((m*2)/3), :], dataMat[int((m*2)/3):, :]] #
#mu = [dataMat[int(m/3), :], dataMat[int(m*2/3), :], dataMat[int(m-1), :]] # 初始化mu
mu= [] # 记录均值向量mu
# 高斯混合模型是对初值敏感的模型, mu的设置一定要保证随机选
for i in range(clusterNum):
alpha.append(1/clusterNum) # 初始化各高斯混合成分参数
index = int(random.uniform(0, m)) # 随机生成一个整数
mu.append(dataMat[index, :]) # 初始化mu
#print('alpha=%s'%alpha)
#print('mu=%s'%mu)
sigma = [mat([[0.1, 0], [0, 0.1]]) for x in range(clusterNum)] # 初始化协方差矩阵
gamma = mat(zeros((m, clusterNum))) # 初始化后验概率gamma
old_sumGamma = [] # 记录跌代前的每个样本Gamma的总和
sumGamma = mat(zeros((1,clusterNum))) # 记录的每个样本Gamma的总和
for i in range(maxIter): #
# E step
for j in range(m): # 遍历所有的样本点
sumAlphaMulP = 0
for k in range(clusterNum):
# 计算样本点j, 第k个高斯混合成分生成的后验概率
gamma[j, k] = alpha[k] * prob(dataMat[j, :], mu[k], sigma[k]) # 第k个高斯混合成分生成的后验概率的分子
sumAlphaMulP += gamma[j, k] # 求和得出分母分母
for k in range(clusterNum):
gamma[j, k] = gamma[j, k]/sumAlphaMulP # 计算样本点j, 第k个高斯混合成分生成的后验概率
old_sumGamma = copy.deepcopy(sumGamma) # 记录跌代前的每个样本Gamma的总和
sumGamma = sum(gamma, axis=0) # 计算所有样本的高斯混合成分后验概率对应的维度的和
# 计算迭代前后sumGamma的差距, 若果小于阀值, 退出
distance = 0
for k in range(clusterNum):
distance += (old_sumGamma.tolist()[0][k] - sumGamma.tolist()[0][k])**2
distance = sqrt(distance)
if distance < valve:
print('Iter=%s'%i)
break
# M step
for k in range(clusterNum):
mu[k] = mat(zeros((1, n)))
sigma[k] = mat(zeros((n, n)))
# 更新mu
for j in range(m):
mu[k] += gamma[j, k] * dataMat[j, :]
mu[k] /= sumGamma[0, k]
# 更新sigma
for j in range(m):
sigma[k] += gamma[j, k] * (dataMat[j, :] - mu[k]).T * (dataMat[j, :] - mu[k])
sigma[k] /= sumGamma[0, k]
# 更新alpha
alpha[k] = sumGamma[0, k] / m
# print(mu)
return gamma
# 初始化聚类簇
def initCentroids(dataMat, k):
numSamples, dim = dataMat.shape # 获取数据集的样本点个数以及样本点的维度
centroids = zeros((k, dim)) # 初始化k个, 维度为dim的聚类簇的坐标
for i in range(k): # 所有聚类簇
index = int(random.uniform(0, numSamples)) # 随机生成一个整数
centroids[i, :] = dataMat[index, :] # 初始化聚类簇
return centroids
# 高斯聚类
def gaussianCluster(dataMat, clusterNum = 3):
m, n = shape(dataMat) # 获取数据集的样本点个数以及样本点的维度
if clusterNum > m:
print('the number of clusters is illegal, it should be less than %s'%m)
return
# 每个样本的所属的簇,以及分到该簇对应的响应度
centroids = initCentroids(dataMat, clusterNum) # 初始化聚类簇
clusterAssign = mat(zeros((m, n))) # 初始化簇分配矩阵
gamma = EM(dataMat, clusterNum) # 计算后验概率
# 确定每个样本的簇标记
for i in range(m):
# argmax返回矩阵最大值所在下标, amx返回矩阵最大值
clusterAssign[i, :] = argmax(gamma[i, :]), amax(gamma[i, :])
#print('clusterAssign[i, :]=%s'%clusterAssign[i, :])
# 确定聚类簇点坐标
for j in range(clusterNum):
# 获取所有类别标记为j的点
pointsInCluster = dataMat[nonzero(clusterAssign[:, 0].A == j)[0]]
#print('%s,pointsInCluster=%s'%(j,pointsInCluster))
centroids[j, :] = mean(pointsInCluster, axis=0) # 计算这些点的平均值作为聚类簇的坐标
return centroids, clusterAssign
def showCluster(dataMat, k, centroids, clusterAssment):
numSamples, dim = dataMat.shape
if dim != 2:
print("Sorry! I can not draw because the dimension of your data is not 2!")
return 1
mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
if k > len(mark):
print("Sorry! Your k is too large!")
return 1
# draw all samples
for i in range(numSamples):
markIndex = int(clusterAssment[i, 0])
plt.plot(dataMat[i, 0], dataMat[i, 1], mark[markIndex])
mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
# draw the centroids
for i in range(k):
plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
plt.show()
if __name__ == '__main__':
dataMat = mat(loadData('watermelon4.txt'))
#print(dataMat)
clusterNum = 2 # 簇个数
centroids, clusterAssign = gaussianCluster(dataMat, clusterNum)
#print(clusterAssign)
showCluster(dataMat, clusterNum, centroids, clusterAssign)