This repository has been archived by the owner on Aug 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clusteringCode.m
86 lines (76 loc) · 2.59 KB
/
clusteringCode.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
% ###### session setup
% Start from a clean state: clear the command window, drop all workspace
% variables, and close every open figure.
clc;
clear;
close('all');
% Silence all warnings for the remainder of the session.
warning('off','all');
% Class names; a label's numeric id is its 1-based position in this list.
className_ = {'spam','ham'};
% ###### read english big data
% Reads english_big.txt line by line. On every line the LAST comma is taken
% as the separator: text before it is stored in englishBig_, text after it
% in englishBigLabel_ (so the message part may itself contain commas).
f1_ = fopen('english_big.txt');
if f1_ == -1
    % fopen returns -1 on failure; stop here with a clear message instead of
    % failing later inside fgetl with an invalid file identifier.
    error('clusteringCode:cannotOpenFile','Cannot open english_big.txt for reading.');
end
% Initialise the outputs so they exist even when the file has no usable line.
englishBig_ = {};
englishBigLabel_ = {};
count_ = 0;
while true
    l_ = fgetl(f1_);
    if ~ischar(l_)
        % fgetl yields a non-char value once the end of the file is reached.
        break;
    end
    commaIndex_ = strfind(l_,',');
    if isempty(commaIndex_)
        % No separator on this line (e.g. a blank line): there is nothing
        % to split, and indexing commaIndex_(end) would raise an error.
        continue;
    end
    commaIndex_ = commaIndex_(end);
    count_ = count_ + 1;
    englishBig_{count_} = l_(1:commaIndex_-1);
    englishBigLabel_{count_} = l_(commaIndex_+1:end);
end
fclose(f1_);
% convert char label to numeric label
% Each label becomes its 1-based position in className_; a label that
% matches no entry of className_ is encoded as 0. The result is a column
% vector with one entry per label.
[~,labelPosition_] = ismember(englishBigLabel_,className_);
englishBigLabelNumeric_ = labelPosition_(:);
% process messages
% Pipeline: strip punctuation -> lowercase -> tokenize -> filter tokens ->
% normalize -> bag-of-words -> tf-idf feature matrix.
stopList_ = stopWords;
cleanText_ = lower(erasePunctuation(englishBig_));
tokens_ = tokenizedDocument(cleanText_);
tokens_ = removeWords(tokens_,stopList_);
% Drop very short and very long tokens (length thresholds 3 and 10).
tokens_ = removeShortWords(tokens_,3);
tokens_ = removeLongWords(tokens_,10);
tokens_ = normalizeWords(tokens_);
% Second stop-word pass, this time over the normalized tokens.
tokens_ = removeWords(tokens_,stopList_);
% Build the bag-of-words model and prune infrequent words (threshold 10).
englishBigBag_ = removeInfrequentWords(bagOfWords(tokens_),10);
% Documents left with no words are removed; delete their labels as well so
% that rows and labels stay aligned.
[englishBigBag_,empty_] = removeEmptyDocuments(englishBigBag_);
englishBigLabelNumeric_(empty_) = [];
% From here on englishBig_ holds the dense tf-idf matrix, not the raw text.
englishBig_ = full(tfidf(englishBigBag_));
% ###### randperm data
% Draw one random permutation of the row indices and apply it to both the
% feature matrix and the label vector, so every row keeps its own label.
rowCount_ = size(englishBig_,1);
shuffledRows_ = randperm(rowCount_);
englishBigLabelNumeric_ = englishBigLabelNumeric_(shuffledRows_);
englishBig_ = englishBig_(shuffledRows_,:);
% ###### split data
% The first 70% of the (already shuffled) rows form the training set; the
% remaining rows form the test set.
trainCount_ = round(0.7 * size(englishBig_,1));
firstTestRow_ = trainCount_ + 1;
% Features
Xtr_ = englishBig_(1:trainCount_,:);
Xts_ = englishBig_(firstTestRow_:end,:);
% Labels
Ytr_ = englishBigLabelNumeric_(1:trainCount_);
Yts_ = englishBigLabelNumeric_(firstTestRow_:end);
% ###### cluster data
% k-means with two clusters on the training features.
[clusters_,centroids_] = kmeans(Xtr_,2);
% Align cluster ids with class ids: when the most frequent cluster id
% differs from the most frequent training label, exchange the two
% centroid rows.
if mode(clusters_) ~= mode(Ytr_)
    centroids_ = centroids_([2 1],:);
end
% Euclidean distance from every test row to each centroid
% (row k_ of ED_ holds the distances to centroid k_).
ED_ = zeros(2,size(Xts_,1));
for k_ = 1 : 2
    offset_ = Xts_ - centroids_(k_,:);
    ED_(k_,:) = sqrt(sum(offset_.^2,2));
end
% Predicted label of a test row = index of its nearest centroid.
[~,predictLables_] = min(ED_,[],1);
% EvalCrit is defined outside this file. Only entries 5, 6 and 7 of its
% result are kept; the plotting code labels them accuracy, precision and
% recall -- TODO confirm against EvalCrit.
allCriteria_ = EvalCrit(predictLables_',Yts_);
clusEval_ = [allCriteria_(5),allCriteria_(6),allCriteria_(7)];
% ###### show clustering results
% Bar chart of the three values in clusEval_, each annotated with its
% numeric value just above the bar.
figure;
bar(clusEval_,0.5);
set(gca,'XTickLabel',{'accuracy';'precision';'recall'});
text(1:length(clusEval_),clusEval_,num2str(clusEval_'),'vert','bottom','horiz','center');
% Confusion matrix. plotconfusion expects (targets, outputs), i.e. the
% ground truth first and the predictions second; passing them the other way
% round transposes the chart. The class count is given to ind2vec explicitly
% so both one-hot matrices have the same number of rows even when one class
% does not occur in the predictions or in the test labels.
figure;
plotconfusion(ind2vec(Yts_',length(className_)),ind2vec(predictLables_,length(className_)));
title('confusion from clustering');
% ###### decision tree on the same split
% Fit a classification tree on the training data and predict the test rows.
% Note that predictLables_ is overwritten here with the tree's predictions.
model_ = fitctree(Xtr_,Ytr_);
predictLables_ = predict(model_,Xts_);
% Confusion matrix. plotconfusion expects (targets, outputs), i.e. the
% ground truth first and the predictions second. The class count is given
% to ind2vec explicitly so both one-hot matrices have the same number of
% rows even when one class is missing from either vector.
figure;
plotconfusion(ind2vec(Yts_',length(className_)),ind2vec(predictLables_',length(className_)));
title('confusion from decision tree');