Fixes for bugs discovered during testing

kalininalab · Apr 4, 2024 · 62f0afe · 62f0afe
1 parent 676674b
commit 62f0afe
Show file tree

Hide file tree

Showing 3 changed files with 5 additions and 14 deletions.
diff --git a/datasail/cluster/vectors.py b/datasail/cluster/vectors.py
@@ -13,13 +13,12 @@
     "russel", "sokal", "tanimoto"
 ]
 
-# unbounded: mcconnaughey
+# unbounded: canberra, chebyshev, cityblock, euclidean, mcconnaughey, manhattan, minkowski, sqeuclidean
 # produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
 # boolean only: dice, kulczynski1, rogerstanimoto, russelrao, sokalmichener, sokalsneath, yule
 # matching == hamming, manhattan == cityblock (unofficial)
 DIST_OPTIONS = Literal[
-    "canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard", "mahalanobis", "manhattan", "matching",
-    "minkowski", "sqeuclidean"
+    "hamming", "jaccard", "mahalanobis", "matching"
 ]
 
 
@@ -181,9 +180,7 @@ def run(
                 f"the embeddings. The number of samples ({len(fps)}) is too small; the covariance matrix is singular. "
                 f"For observations with {len(fps[0])} dimensions, at least {len(fps[0]) + 1} observations are required."
             )
-        dataset.cluster_distance = scipy.spatial.distance.cdist(
-            fps, fps, metric={"manhattan": "cityblock", "tanimoto": "jaccard"}.get(method, method)
-        )
+        dataset.cluster_distance = scipy.spatial.distance.cdist(fps, fps, metric=method)
 
 
 # if __name__ == '__main__':

diff --git a/datasail/solver/cluster_1d.py b/datasail/solver/cluster_1d.py
@@ -63,8 +63,6 @@ def solve_c1(
     ) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))]  # 15
 
     loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
-    if distances is not None:
-        loss = -loss
     problem = solve(loss, constraints, max_sec, solver, log_file)
 
     return None if problem is None else {

diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -257,9 +257,7 @@ def test_mmseqspp_protein():
 @pytest.mark.parametrize("in_type", ["Original", "List", "Numpy"])
 @pytest.mark.parametrize("method", [
     "allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg", "russel",
-    "sokal",
-    "canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard",
-    "mahalanobis", "manhattan", "matching", "minkowski", "sqeuclidean", "tanimoto"
+    "sokal", "tanimoto", "hamming", "jaccard", "mahalanobis", "matching"
 ])
 def test_vector(md_calculator, algo, in_type, method) :
     data = molecule_data()
@@ -284,9 +282,7 @@ def test_vector(md_calculator, algo, in_type, method) :
 
 @pytest.mark.parametrize("method", [
     "allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg", "russel",
-    "sokal",
-    "canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard", "mahalanobis", "manhattan", "matching",
-    "minkowski", "sqeuclidean", "tanimoto"
+    "sokal", "hamming", "jaccard", "mahalanobis", "matching"
 ])
 def test_vector_edge(method):
     dataset = DataSet(