Merge branch 'main' into bugs/1321-_Bug_factories_array_illegal_parameter_combinations_copy_&_split
helmholtz-analytics · Apr 15, 2024 · 91b7125 · 91b7125 · github-actions · Apr 15, 2024
2 parents 820bf43 + b070199
commit 91b7125
Show file tree

Hide file tree

Showing 19 changed files with 1,198 additions and 17 deletions.
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -11,9 +11,7 @@ categories:
     label: 'benchmarking'
   - title: '📜 Documentation'
     label: 'documentation :book:'
-  - title: '🧹 Maintenance'
-    label: 'chore'
-  - title: '🧪 Testing'
+  - title: ' Testing'
     label: 'testing'
   - title: 'Arithmetic'
     label: 'arithmetic'
@@ -92,11 +90,18 @@ categories:
     label: 'trigonometrics'
   - title: 'Types'
     label: 'types'
+exclude-labels:
+  - 'github-actions'
+  - 'dependencies'
+  - 'chore'
+  - 'workflows'
+autolabeler:
+  - label: 'chore'
+    title:
+    - '\[pre-commit.ci\] .*'
 
 change-template: '- #$NUMBER $TITLE (by @$AUTHOR)'
 category-template: '### $TITLE'
-exclude-labels:
-- 'workflow'
 template: |
   ## Changes
 

diff --git a/.github/workflows/bench_report.yml b/.github/workflows/bench_report.yml
@@ -38,7 +38,7 @@ jobs:
           cat report.txt >> $GITHUB_STEP_SUMMARY
       - name: Compare and Save Benchmark Results
         id: action_bench
-        uses: benchmark-action/github-action-benchmark@70405016b032d44f409e4b1b451c40215cbe2393 # v1.18.0
+        uses: benchmark-action/github-action-benchmark@1846227a307d8c0149b960b986d46f8f4c95db0c # v1.20.1
         with:
           github-token: ${{secrets.GITHUB_TOKEN}}
           # Benchmark action input and output

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -50,7 +50,7 @@ jobs:
 
       # Initializes the CodeQL tools for scanning.
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+        uses: github/codeql-action/init@4355270be187e1b672a7a1c7c7bae5afdc1ab94a # v3.24.10
         with:
           languages: ${{ matrix.language }}
           # If you wish to specify custom queries, you can do so here or in a config file.
@@ -60,7 +60,7 @@ jobs:
       # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
       # If this step fails, then you should remove it and run the build manually (see below)
       - name: Autobuild
-        uses: github/codeql-action/autobuild@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+        uses: github/codeql-action/autobuild@4355270be187e1b672a7a1c7c7bae5afdc1ab94a # v3.24.10
 
       # ℹ️ Command-line programs to run using the OS shell.
       # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -73,6 +73,6 @@ jobs:
       #   ./location_of_script_within_repo/buildscript.sh
 
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+        uses: github/codeql-action/analyze@4355270be187e1b672a7a1c7c7bae5afdc1ab94a # v3.24.10
         with:
           category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/create-branch-on-assignment.yml b/.github/workflows/create-branch-on-assignment.yml
@@ -16,6 +16,6 @@ jobs:
           egress-policy: audit
 
       - name: Create Issue Branch
-        uses: robvanderleek/create-issue-branch@0be404227bd2ade7be1520cdde0ce9b8a11cf2d4 # main
+        uses: robvanderleek/create-issue-branch@de4d4fe6e4e37c743e86edd4bd98c3f522746fd4 # main
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
@@ -64,14 +64,14 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
         with:
           name: SARIF file
           path: results.sarif
           retention-days: 5
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+        uses: github/codeql-action/upload-sarif@4355270be187e1b672a7a1c7c7bae5afdc1ab94a # v3.24.10
         with:
           sarif_file: results.sarif
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer

diff --git a/heat/cluster/__init__.py b/heat/cluster/__init__.py
@@ -7,3 +7,4 @@
 from .kmedians import *
 from .kmedoids import *
 from .spectral import *
+from .batchparallelclustering import *
diff --git a/heat/cluster/_kcluster.py b/heat/cluster/_kcluster.py
@@ -23,6 +23,7 @@ class _KCluster(ht.ClusteringMixin, ht.BaseEstimator):
 
         - ‘probability_based’ : selects initial cluster centers for the clustering in a smart way to speed up convergence (k-means++)
         - ‘random’: choose k observations (rows) at random from data for the initial centroids.
+        - 'batchparallel': use the batch parallel algorithm to initialize the centroids, only available for split=0 and KMeans or KMedians
         - ``DNDarray``: gives the initial centers, should be of Shape = (n_clusters, n_features)
     max_iter : int
         Maximum number of iterations for a single run.
@@ -50,9 +51,11 @@ def __init__(
         # in-place properties
         self._metric = metric
         self._cluster_centers = None
+        self._functional_value = None
         self._labels = None
         self._inertia = None
         self._n_iter = None
+        self._p = None
 
     @property
     def cluster_centers_(self) -> DNDarray:
@@ -84,6 +87,13 @@ def n_iter_(self) -> int:
         """
         return self._n_iter
 
+    @property
+    def functional_value_(self) -> DNDarray:
+        """
+        Returns the K-Clustering functional value of the clustering algorithm
+        """
+        return self._functional_value
+
     def _initialize_cluster_centers(self, x: DNDarray):
         """
         Initializes the K-Means centroids.
@@ -186,26 +196,58 @@ def _initialize_cluster_centers(self, x: DNDarray):
                 raise NotImplementedError("Not implemented for other splitting-axes")
             self._cluster_centers = centroids
 
+        elif self.init == "batchparallel":
+            if x.split == 0:
+                if self._p == 2:
+                    batch_parallel_clusterer = ht.cluster.BatchParallelKMeans(
+                        n_clusters=self.n_clusters,
+                        init="k-means++",
+                        max_iter=100,
+                        random_state=self.random_state,
+                    )
+                elif self._p == 1:
+                    batch_parallel_clusterer = ht.cluster.BatchParallelKMedians(
+                        n_clusters=self.n_clusters,
+                        init="k-medians++",
+                        max_iter=100,
+                        random_state=self.random_state,
+                    )
+                else:
+                    raise ValueError(
+                        "Batch parallel initialization only implemented for KMeans and KMedians"
+                    )
+                batch_parallel_clusterer.fit(x)
+                self._cluster_centers = batch_parallel_clusterer.cluster_centers_
+            else:
+                raise NotImplementedError(
+                    f"Batch parallel initalization only implemented for split = 0, but split was {x.split}"
+                )
+
         else:
             raise ValueError(
-                'init needs to be one of "random", ht.DNDarray or "kmeans++", but was {}'.format(
+                'init needs to be one of "random", ht.DNDarray, "kmeans++", or "batchparallel", but was {}'.format(
                     self.init
                 )
             )
 
-    def _assign_to_cluster(self, x: DNDarray):
+    def _assign_to_cluster(self, x: DNDarray, eval_functional_value: bool = False):
         """
         Assigns the passed data points to the centroids based on the respective metric
 
         Parameters
         ----------
         x : DNDarray
             Data points, Shape = (n_samples, n_features)
+        eval_functional_value : bool, default: False
+            If True, the current K-Clustering functional value of the clustering algorithm is evaluated
         """
         # calculate the distance matrix and determine the closest centroid
         distances = self._metric(x, self._cluster_centers)
         matching_centroids = distances.argmin(axis=1, keepdims=True)
 
+        if eval_functional_value:
+            self._functional_value = ht.norm(distances.min(axis=1), ord=self._p) ** self._p
+
         return matching_centroids
 
     def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray):
@@ -251,4 +293,4 @@ def predict(self, x: DNDarray):
             raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}")
 
         # determine the centroids
-        return self._assign_to_cluster(x)
+        return self._assign_to_cluster(x, eval_functional_value=True)
Benchmark suite	Current: `91b7125`	Previous: `b070199`	Ratio
`matmul_split_0_N1_GPU - RUNTIME`	`0.005878224968910217` s (`0.016186576336622238`)	`0.0028138693887740374` s (`0.00703318789601326`)	`2.09`
`heat_benchmarks_N4_CPU - ENERGY`	`0.35506324768066405` kJ (`0.19014888023064366`)	`0.1653309066772461` kJ (`0.017067791473682935`)	`2.15`
`apply_inplace_standard_scaler_and_inverse_N4_CPU - RUNTIME`	`0.044499970972537994` s (`0.016452759504318237`)	`0.01812922954559326` s (`0.007081166375428438`)	`2.45`