
Bugs/1592 Resolved two bugs in BatchParallel Clustering #1593

Merged
22 changes: 13 additions & 9 deletions heat/cluster/batchparallelclustering.py
@@ -19,20 +19,24 @@
"""


-def _initialize_plus_plus(X, n_clusters, p, random_state=None):
+def _initialize_plus_plus(X, n_clusters, p, random_state=None, max_samples=2**24 - 1):
Member:
Some unsuspecting user could try to change this value to something higher and then run into the limit in torch. Should we hard-code it?

Collaborator Author (@mrfh92, Aug 12, 2024):
Actually, this is already effectively hard-coded, since this is an auxiliary function that is not exposed to the user directly.
The reason for introducing max_samples as a parameter was to keep some flexibility for adapting it in the future.
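
For reference, a minimal sketch (not part of the PR) of the torch limit that motivates the default of 2**24 - 1; the exact error message and boundary may vary between PyTorch versions:

```python
import torch

# torch.multinomial rejects inputs with more than 2**24 categories,
# which is why max_samples is kept just below that bound.
weights_ok = torch.ones(2**24 - 1)
torch.multinomial(weights_ok, 1)          # fine

weights_too_big = torch.ones(2**24 + 1)   # roughly 64 MiB of float32 weights
try:
    torch.multinomial(weights_too_big, 1)
except RuntimeError as err:
    print(err)                            # e.g. "number of categories cannot exceed 2^24"
```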

Collaborator Author (@mrfh92):
I have added a comment to the function's description.

"""
Auxiliary function: single-process k-means++/k-medians++ initialization in pytorch
p is the norm used for computing distances
"""
if random_state is not None:
torch.manual_seed(random_state)
-    idxs = torch.zeros(n_clusters, dtype=torch.long, device=X.device)
-    idxs[0] = torch.randint(0, X.shape[0], (1,))
-    for i in range(1, n_clusters):
-        dist = torch.cdist(X, X[idxs[:i]], p=p)
-        dist = torch.min(dist, dim=1)[0]
-        idxs[i] = torch.multinomial(dist, 1)
-    return X[idxs]
+    if X.shape[0] <= max_samples:  # torch's multinomial is limited to 2^24 categories
+        idxs = torch.zeros(n_clusters, dtype=torch.long, device=X.device)
+        idxs[0] = torch.randint(0, X.shape[0], (1,))
+        for i in range(1, n_clusters):
+            dist = torch.cdist(X, X[idxs[:i]], p=p)
+            dist = torch.min(dist, dim=1)[0]
+            idxs[i] = torch.multinomial(dist, 1)
+        return X[idxs]
+    else:  # if X is too large for the 2^24 bound, use a random subset of X
+        idxs = torch.randint(0, X.shape[0], (max_samples,))
+        return _initialize_plus_plus(X[idxs], n_clusters, p, random_state)


def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None):
@@ -289,7 +293,7 @@ def predict(self, x: DNDarray):

         local_labels = _parallel_batched_kmex_predict(
             x.larray, self._cluster_centers.larray, self._p
-        )
+        ).to(torch.int32)
Member:
Why not do it the other way around and set the Heat array to the proper output type? I get the argument that it is an unlikely number of clusters, but it could theoretically happen.

Collaborator Author (@mrfh92):
I also thought about this; my arguments for the chosen solution were:

  • int32 saves 50% of the memory compared to int64 during further processing of the clustering result (see the rough sketch below);
  • in theory, more than int32-many cluster centers are conceivable, but in practice this is completely out of scope: the runtime of our clustering algorithms depends heavily on the number of cluster centers, and the usual reason for clustering is to gain insight into the structure of the data by grouping it into a comparably small number of clusters.
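
As a rough illustration of the memory argument (the numbers below are hypothetical, not from the PR):

```python
import torch

n_samples = 10_000_000  # hypothetical dataset size
labels_int64 = torch.zeros(n_samples, dtype=torch.int64)
labels_int32 = torch.zeros(n_samples, dtype=torch.int32)

# 8 bytes vs. 4 bytes per label, i.e. int32 halves the label memory footprint
print(labels_int64.element_size() * labels_int64.numel() / 2**20)  # ~76.3 MiB
print(labels_int32.element_size() * labels_int32.numel() / 2**20)  # ~38.1 MiB
```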

         labels = DNDarray(
             local_labels,
             gshape=(x.shape[0], 1),
6 changes: 5 additions & 1 deletion heat/cluster/tests/test_batchparallelclustering.py
@@ -7,7 +7,7 @@
from mpi4py import MPI

from ...core.tests.test_suites.basic_test import TestCase
-from ..batchparallelclustering import _kmex, _BatchParallelKCluster
+from ..batchparallelclustering import _kmex, _initialize_plus_plus, _BatchParallelKCluster

# test BatchParallelKCluster base class and auxiliary functions

@@ -32,6 +32,10 @@ def test_kmex(self):
         init = torch.rand(2, 3)
         _kmex(X, 2, 2, init, max_iter, tol)

+    def test_initialize_plus_plus(self):
+        X = torch.rand(100, 3)
+        _initialize_plus_plus(X, 3, 2, random_state=None, max_samples=50)

     def test_BatchParallelKClustering(self):
         with self.assertRaises(TypeError):
             _BatchParallelKCluster(2, 10, "++", 100, 1e-2, random_state=3.14, n_procs_to_merge=None)
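
A possible follow-up for the new test, sketched here under the assumption that the helper returns a tensor of shape (n_clusters, n_features) and that _initialize_plus_plus is imported as in the test module, would be to also assert on the returned centroids:

```python
import torch

X = torch.rand(100, 3)
centers = _initialize_plus_plus(X, 3, 2, random_state=0, max_samples=50)
# 3 initial centers drawn from the 3-dimensional samples, also when the
# max_samples subsampling branch is taken (100 > 50)
assert centers.shape == (3, 3)
```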