-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_cluster.py
46 lines (35 loc) · 1.35 KB
/
make_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
import os
from sklearn.cluster import AgglomerativeClustering
import shutil
from joblib import Memory
import joblib
from sklearn.cluster import DBSCAN
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans
# Use joblib to cache (memoize) expensive results on disk under "cachedir".
memory = Memory("cachedir", verbose=0)


@memory.cache
def _fit_kmeans(vectors, n_clusters):
    """Fit a seeded KMeans model on ``vectors`` and return the fitted estimator.

    Only this pure computation is memoized. Side effects must not live in a
    memoized function: on a cache hit joblib returns the stored result
    without executing the function body.
    """
    clusterer = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    clusterer.fit(vectors)
    return clusterer


def hierarchical_clustering(vectors, n_clusters=10, model_path="clustering.pt"):
    """Cluster ``vectors`` with KMeans, persist the model, and return labels.

    NOTE: despite its name, this function runs KMeans rather than a
    hierarchical algorithm; the name is kept so existing callers keep working.

    Args:
        vectors: Feature matrix passed to ``KMeans.fit`` (one row per sample).
        n_clusters: Number of clusters to fit. Defaults to 10.
        model_path: Destination for the ``joblib.dump`` of the fitted model.
            Defaults to "clustering.pt".

    Returns:
        The ``labels_`` array of the fitted model (one label per input row).
    """
    # The fit is served from the on-disk cache when the same inputs were seen
    # before; everything below runs on every call so the model file is always
    # written, even on a cache hit.
    clusterer = _fit_kmeans(vectors, n_clusters)
    labels = clusterer.labels_
    print(labels)
    print(set(labels))
    joblib.dump(clusterer, model_path)
    return labels
# Folder holding one saved NumPy array per sample. The part of each file name
# before the first "." must parse as an integer; it is used as the sort key.
data_folder = '/mlcv/WorkingSpace/Personals/ngocnd/BKAI2023/BKAI-NAVER-OCR-2023/Data/Backbone_vector/DataTrain'

# Single cap shared by the vectors and the image paths so both collections
# have the same length (the earlier `idx > 1000` check loaded 1001 vectors
# while only 1000 image paths were kept).
MAX_SAMPLES = 1000

# Sort numerically by file stem so that e.g. "10" comes after "9".
file_paths = sorted(
    (os.path.join(data_folder, file) for file in os.listdir(data_folder)),
    key=lambda x: int(os.path.basename(x).split(".")[0]),
)

vectors = []
for idx, file in enumerate(file_paths[:MAX_SAMPLES]):
    # Each stored array is flattened to 1-D before being stacked as a row.
    vectors.append(np.load(file).flatten())
    if (idx + 1) % 10 == 0:
        print(f"Loaded {idx+1} vectors...")
vectors = np.array(vectors)

# One path per line; keep the same number of entries as loaded vectors.
with open('/mlcv/WorkingSpace/Personals/ngocnd/BKAI2023/BKAI-NAVER-OCR-2023/Data/Backbone_vector/vector.txt', 'r') as f:
    image_paths = [path.strip() for path in f][:MAX_SAMPLES]
print('Load image done')

labels = hierarchical_clustering(vectors)
print('Train done')