-
Notifications
You must be signed in to change notification settings - Fork 0
/
DTW-perf.py
executable file
·161 lines (152 loc) · 7.36 KB
/
DTW-perf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
########################################################################################
#
# DTW-perf.py
# Author: Gao Yueyao
# Python 3.6.10
# Requires the following Python packages:
# numpy(=1.18.1), pandas(1.0.3), matplotlib(3.2.1), tslearn(=0.41)
#
########################################################################################
#
# Import dependencies
#
########################################################################################
import os
import math
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tslearn.clustering import TimeSeriesKMeans
import argparse
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score
########################################################################################
#
# Description of script
#
########################################################################################
parser = argparse.ArgumentParser(description="""
DTW-perf.py takes a gene expression matrix as input and run soft-DTW k-means
according to Marco Cuturi and Mathieu Blondel (https://arxiv.org/abs/1703.01541).
The purpose of this script is to select the optimal number of clusters by
evaluating the performance of the DTW-KMeans clustering algorithm by
fitting the model with a range of K values.
This script uses three performance evaluation method:
1. Elbow Method
Calculate Distortion (the average of the squared distances from the cluster centers of the respective clusters)
2. Silhouette coefficient[1]:
near +1 indicate that the sample is far away from the neighboring clusters
https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient
3. Calinski harabasz Index[2]:
Computes the ratio of dispersion between and within clusters.
A higher Calinski-Harabasz score relates to a model with better defined clusters.
[1]Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the Interpretation and Validation of Cluster Analysis”.
Computational and Applied Mathematics 20: 53–65. doi:10.1016/0377-0427(87)90125-7.
[2]Caliński, T., & Harabasz, J. (1974). “A Dendrite Method for Cluster Analysis”.
Communications in Statistics-theory and Methods 3: 1-27. doi:10.1080/03610927408827101.
""")
#########################################################################################
#
# Required arguments
#
##########################################################################################
parser.add_argument('-i','--input',dest='gene_expression_matrix',action="store",required=True,help="""
The format of the gene_expression_matrix.txt is:
gene 1 2 3 ... time_t
gene_1 10 20 5 ... 8
gene_2 3 2 50 ... 8
gene_3 18 100 10 ... 22
...
gene_n 45 22 15 ... 60
""")
parser.add_argument('-kmin','--Kmin',dest="Kmin",type=int,action='store',help="The minimum K value")
parser.add_argument('-kmax','--Kmax',dest="Kmax",type=int,action='store',help="The maximum K value")
parser.add_argument('-step','--stepsize',dest="stepsize",type=int,action='store',help="step size of K value range")
parser.add_argument('-o','--output',dest="output_path_prefix",action='store',help='required, e.g. /path/to/PERF_results')
args = parser.parse_args()
##########################################################################################
exp_df = pd.read_csv(args.gene_expression_matrix,sep="\t",index_col='gene')
MTRinput_df = pd.read_csv(args.gene_expression_matrix,sep='\t',index_col='gene')
MTR_arr = MTRinput_df.values
print('Input Gene Expression Matrix has {} entries with {} time points'.format(MTR_arr.shape[0],MTR_arr.shape[1]))
# Normalization with standard scaler
gene_expression_matrix = MTR_arr
gene_expression_matrix -= np.vstack(np.nanmean(gene_expression_matrix, axis=1))
gene_expression_matrix /= np.vstack(np.nanstd(gene_expression_matrix, axis=1))
# Set up a K range to iterate through for DTW-KMeans model
K_range = np.arange(args.Kmin,args.Kmax,args.stepsize)
# Three ways to measure the performances of DTW-KMeans model
Sum_of_squared_distances = []
ch_indexs = []
silhouette_scores = []
for n_clusters in K_range:
# soft-DTW-Kmeans
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility
clusterer = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw", metric_params={"gamma": .01}, verbose=False,random_state=10)
cluster_labels = clusterer.fit_predict(gene_expression_matrix)
print('The Shape of Cluster Centers are {}'.format(clusterer.cluster_centers_.shape))
# The squared distance for Elbow Method
# Select optimal number of clusters by fitting the model
# with a range of K values
Sum_of_squared_distances.append(clusterer.inertia_)
print("For n_clusters =", n_clusters,
"The sum of squared distance is :", clusterer.inertia_)
#Compute the Calinski and Harabasz score.
#This gives ratio between the within-cluster dispersion and the
# between-cluster dispersion.
ch_score = calinski_harabasz_score(gene_expression_matrix, cluster_labels)
ch_indexs.append(ch_score)
print("For n_clusters =", n_clusters,
"The calinski_harabasz_score is :", ch_score)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(gene_expression_matrix, cluster_labels)
silhouette_scores.append(silhouette_avg)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(gene_expression_matrix, cluster_labels)
# Plot the actual time-series clusters formed
# Time Set
t_label = list(map(int,MTRinput_df.columns.tolist()))
t = t_label
t /= np.mean(np.diff(t))
plt.figure(figsize=(20,20))
subplots = int(math.sqrt(n_clusters))+1
for yi in range(n_clusters):
ax = plt.subplot(subplots,subplots, yi + 1)
for xx in gene_expression_matrix[cluster_labels == yi]:
ax.plot(t,xx.ravel(),'k-', alpha=0.15)
ax.plot(t,clusterer.cluster_centers_[yi].ravel(), "r-")
ax.set_xticks(t)
ax.set_xticklabels(t_label)
ax.set_ylabel('Gene expression')
ax.set_title('Cluster {} ({})'.format(yi + 1,collections.Counter(cluster_labels)[yi]))
ax.grid(False)
if yi == 1:
plt.title("DTW Mtr $k$-means")
plt.tight_layout()
plt.savefig(args.output_path_prefix+'K{}_clustering.png'.format(n_clusters),dpi=200)
#Plot the results of Elbow Method, silhouette score, and ch index
fig, (ax0, ax1, ax2) = plt.subplots(3,figsize=(5, 15))
ax0.plot(K_range, Sum_of_squared_distances, 'bo-')
ax0.set_xlabel('n_clusters')
ax0.set_ylabel('Sum of Squared Distances')
ax0.set_title('Distortion for DTW-KMeans Clustering')
ax0.grid(False)
ax1.plot(K_range, ch_indexs, 'g.-')
ax1.set_xlabel('n_clusters')
ax1.set_ylabel('ch_score')
ax1.set_title('Calinski and Harabaz Score for DTW-KMeans Clustering')
ax1.grid(False)
ax2.plot(K_range, silhouette_scores, 'r*-')
ax2.set_xlabel('n_clusters')
ax2.set_ylabel('Averge Silhouette Score')
ax2.set_title('Averge Silhouette Score for DTW-KMeans Clustering')
ax2.grid(False)
plt.tight_layout()
plt.savefig(args.output_path_prefix+'_DTW-KMeans_PERFs.png',dpi=200)