# DistanceMetrics.py
import math
import hashlib

import numpy as np

# Module-level cache used by Similarity.fractional_distance for memoization.
memoization = {}


class Similarity:
    """
    This class contains implementations of similarity / distance metrics. These are used in centroid-based clustering
    algorithms to identify similar patterns and put them into the same homogeneous subsets.

    :param minimum: the minimum distance between two patterns (so you don't divide by 0)
    """

    def __init__(self, minimum=1.0e-16):
        self.e = minimum
        self.vector_operators = VectorOperations()

    def manhattan_distance(self, p_vec, q_vec):
        """
        This method implements the Manhattan distance metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the Manhattan distance between vector one and two
        """
        return max(np.sum(np.fabs(p_vec - q_vec)), self.e)

    def square_euclidean_distance(self, p_vec, q_vec):
        """
        This method implements the squared Euclidean distance metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the squared Euclidean distance between vector one and two
        """
        diff = p_vec - q_vec
        return max(np.sum(diff ** 2), self.e)

    def euclidean_distance(self, p_vec, q_vec):
        """
        This method implements the Euclidean distance metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the Euclidean distance between vector one and two
        """
        return max(math.sqrt(self.square_euclidean_distance(p_vec, q_vec)), self.e)

    def half_square_euclidean_distance(self, p_vec, q_vec):
        """
        This method implements the half squared Euclidean distance metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the half squared Euclidean distance between vector one and two
        """
        return max(0.5 * self.square_euclidean_distance(p_vec, q_vec), self.e)

    def cosine_similarity(self, p_vec, q_vec):
        """
        This method implements the cosine similarity metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the cosine similarity between vector one and two
        """
        # sum the element-wise product to obtain the scalar dot product p . q
        pq = np.sum(self.vector_operators.product(p_vec, q_vec))
        p_norm = self.vector_operators.norm(p_vec)
        q_norm = self.vector_operators.norm(q_vec)
        return max(pq / (p_norm * q_norm), self.e)

    def tanimoto_coefficient(self, p_vec, q_vec):
        """
        This method implements the Tanimoto coefficient metric
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the Tanimoto coefficient between vector one and two
        """
        # reduce the element-wise products and squares to scalars before combining them
        pq = np.sum(self.vector_operators.product(p_vec, q_vec))
        p_square = np.sum(self.vector_operators.square(p_vec))
        q_square = np.sum(self.vector_operators.square(q_vec))
        return max(pq / (p_square + q_square - pq), self.e)

    def fractional_distance(self, p_vec, q_vec, fraction=0.5):
        """
        This method implements the fractional distance metric. Memoization is implemented for this method to avoid
        recomputing distances for vector pairs that have already been seen. The net effect is that the algorithm
        runs 400% faster. A similar approach can be used with any of the above distance metrics as well.
        :param p_vec: vector one
        :param q_vec: vector two
        :param fraction: the fractional distance value (power)
        :return: the fractional distance between vector one and two
        """
        # memoization is used to reduce unnecessary calculations ... makes a BIG difference
        memoize = True
        if memoize:
            key = self.get_key(p_vec, q_vec)
            x = memoization.get(key)
            if x is None:
                diff = p_vec - q_vec
                # take the fractional power of the absolute differences (fractional powers of
                # negative numbers are undefined over the reals)
                diff_fraction = np.power(np.abs(diff), fraction)
                x = max(np.power(np.sum(diff_fraction), 1.0 / fraction), self.e)
                # store the result so subsequent calls with the same vectors hit the cache
                memoization[key] = x
            return x
        else:
            diff = p_vec - q_vec
            diff_fraction = np.power(np.abs(diff), fraction)
            return max(np.power(np.sum(diff_fraction), 1.0 / fraction), self.e)

    @staticmethod
    def get_key(p_vec, q_vec):
        """
        This method returns a unique key for a pair of vectors. The key is the concatenation of the hash digests of
        vector one and vector two, e.g. if hash(p_vec) = 1234 and hash(q_vec) = 5678 then
        get_key(p_vec, q_vec) = 12345678. Memoization improved the speed of this algorithm by 400%.
        :param p_vec: vector one
        :param q_vec: vector two
        :return: a unique key for the pair of vectors
        """
        # hexdigest() makes the key depend on the vector contents rather than on the
        # repr of the intermediate hash object
        return hashlib.sha1(p_vec).hexdigest() + hashlib.sha1(q_vec).hexdigest()


class VectorOperations:
    """
    This class contains useful implementations of methods which can be performed on vectors
    """

    @staticmethod
    def product(p_vec, q_vec):
        """
        This method returns the element-wise product of two lists / vectors
        :param p_vec: vector one
        :param q_vec: vector two
        :return: the element-wise product of p_vec and q_vec
        """
        return p_vec * q_vec

    @staticmethod
    def square(p_vec):
        """
        This method returns the element-wise square of a vector
        :param p_vec: the vector to be squared
        :return: the element-wise square of the vector
        """
        return p_vec ** 2

    @staticmethod
    def norm(p_vec):
        """
        This method returns the Euclidean norm of a vector
        :param p_vec: the vector to be normed
        :return: the norm of the vector
        """
        # the Euclidean norm is the square root of the sum of squared components
        return np.sqrt(np.sum(p_vec ** 2))
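

# The block below is a small usage sketch and is not part of the original module: it builds a
# Similarity instance, evaluates a few of the metrics above on a pair of example vectors, and
# calls fractional_distance twice to show that the second call is served from the memoization
# cache. The example vectors and printed labels are illustrative assumptions only.
if __name__ == "__main__":
    p = np.array([1.0, 2.0, 3.0])
    q = np.array([2.0, 4.0, 6.0])

    similarity = Similarity()
    print("manhattan:", similarity.manhattan_distance(p, q))
    print("euclidean:", similarity.euclidean_distance(p, q))
    print("cosine:", similarity.cosine_similarity(p, q))
    print("tanimoto:", similarity.tanimoto_coefficient(p, q))

    # the first call computes and caches the distance; the second call reuses the cached value
    d1 = similarity.fractional_distance(p, q)
    d2 = similarity.fractional_distance(p, q)
    print("fractional:", d1, "(second call returned the cached value:", d1 == d2, ")")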