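# summarize-cluster.py: for each labelled cluster, print its most frequent
# stemmed words, the mean cosine similarity of member embeddings to the
# cluster centroid (a rough cohesion score), and its titles; finally,
# report the least cohesive cluster.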
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.metrics.pairwise import cosine_similarity
# set up nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
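# ('punkt_tab' is the tokenizer data used by newer NLTK releases; fetching
# both 'punkt' and 'punkt_tab' should keep this working across versions)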
# load the labelled clusters
df = pd.read_csv('labelled-embeddings.csv', sep='\t', engine='python')
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 100)
labels = df['labels'].unique()
labels.sort()
print(labels)
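# per-label cohesion score: mean cosine similarity of each member
# embedding to its cluster centroid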
label_scores = {}
for label in labels:
    print('----', label, '----')
    cluster = df.loc[df['labels'] == label]
    cluster_content = cluster['embedding_content']
    # decode embeddings
    dfe = cluster['embedding'].apply(
        lambda x: [float(num) for num in x.strip('[]').split(',')]
    )
    num_columns = len(dfe.iloc[0])
    column_names = range(num_columns)
    dfc = pd.DataFrame(dfe.to_list(), columns=column_names)
    # now compute centroid and similarities
    centroid = np.mean(dfc.to_numpy(), axis=0).reshape(1, -1)
    cum = 0.0
    count = 0
    for _, row in dfc.iterrows():
        rowvals = row.values.reshape(1, -1)
        similarity = cosine_similarity(rowvals, centroid)
        cum += similarity[0][0]  # cosine_similarity returns a 1x1 array; keep the scalar
        count += 1
    cum /= count
    label_scores[label] = cum
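    # note: the row loop above could be replaced by an equivalent
    # vectorized one-liner (a sketch, not what the script runs):
    #   cum = cosine_similarity(dfc.to_numpy(), centroid).mean()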
    # normalize text
    as_text = ' '.join(cluster_content.astype(str).values.flatten())
    words = word_tokenize(as_text)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    stemmer = PorterStemmer()
    cleaned_words = [
        stemmer.stem(word.lower())  # apply stemming to the word
        for word in words
        if len(word) >= 6 and word.lower() not in stop_words and word not in punctuation
    ]
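    # the len(word) >= 6 filter keeps only longer tokens, so short common
    # words don't dominate the frequency counts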
    # get the top tokens
    fdist = FreqDist(cleaned_words)
    top_words = fdist.most_common(8)
    print('top stemmed words: ', top_words)
    print('mean cosine sim  : ', cum)  # this is a similarity, not a distance
    print(cluster['title'])
for key, value in label_scores.items():
    print(key, value)
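# the lowest mean similarity marks the least cohesive cluster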
min_item = min(label_scores.items(), key=lambda x: x[1])
print(min_item)
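# expected input: labelled-embeddings.csv, tab-separated, with at least the
# columns 'labels', 'embedding' (one '[f1, f2, ...]' string per row),
# 'embedding_content', and 'title'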