Skip to content

Commit

Permalink
Merge pull request scikit-learn#2459 from dougalsutherland/euclidean_…
Browse files Browse the repository at this point in the history
…distances

[MRG + 1] support X_norm_squared in euclidean_distances
  • Loading branch information
GaelVaroquaux committed Sep 1, 2015
2 parents 201aca9 + 8d8b434 commit 4d39cf8
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 12 deletions.
33 changes: 21 additions & 12 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def check_paired_arrays(X, Y):


# Pairwise distances
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
X_norm_squared=None):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
Expand All @@ -157,8 +158,8 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
This formulation has two advantages over other ways of computing distances.
First, it is computationally efficient when dealing with sparse data.
Second, if x varies but y remains unchanged, then the right-most dot
product `dot(y, y)` can be pre-computed.
Second, if one argument varies but the other remains unchanged, then
`dot(x, x)` and/or `dot(y, y)` can be pre-computed.
However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
Expand All @@ -179,6 +180,10 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
squared : boolean, optional
Return squared Euclidean distances.
X_norm_squared : array-like, shape = [n_samples_1], optional
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)
Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
Expand All @@ -200,24 +205,28 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
--------
paired_distances : distances between pairs of elements of X and Y.
"""
# should not need X_norm_squared because if you could precompute that as
# well as Y, then you should just pre-compute the output and not even
# call this function.
X, Y = check_pairwise_arrays(X, Y)

if Y_norm_squared is not None:
if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = check_array(Y_norm_squared)
if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

if X is Y: # shortcut in the common case euclidean_distances(X, X)
XX = YY.T
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
Expand Down
25 changes: 25 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,31 @@ def test_euclidean_distances():
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])

rng = np.random.RandomState(0)
X = rng.random_sample((10, 4))
Y = rng.random_sample((20, 4))
X_norm_sq = (X ** 2).sum(axis=1)
Y_norm_sq = (Y ** 2).sum(axis=1)

# check that we still get the right answers with {X,Y}_norm_squared
D1 = euclidean_distances(X, Y)
D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
Y_norm_squared=Y_norm_sq)
assert_array_almost_equal(D2, D1)
assert_array_almost_equal(D3, D1)
assert_array_almost_equal(D4, D1)

# check we get the wrong answer with wrong {X,Y}_norm_squared
X_norm_sq *= 0.5
Y_norm_sq *= 0.5
wrong_D = euclidean_distances(X, Y,
X_norm_squared=np.zeros_like(X_norm_sq),
Y_norm_squared=np.zeros_like(Y_norm_sq))
assert_greater(np.max(np.abs(wrong_D - D1)), .01)



# Paired distances

Expand Down

0 comments on commit 4d39cf8

Please sign in to comment.