DOC Add basic example gallery infrastructure and two examples #5845
base: branch-24.06
@@ -76,4 +76,5 @@ dependencies:
 - umap-learn==0.5.3
 - pip:
     - dask-glm==0.3.0
+    - sphinx-gallery
 name: all_cuda-118_arch-x86_64
@@ -72,4 +72,5 @@ dependencies:
 - umap-learn==0.5.3
 - pip:
     - dask-glm==0.3.0
+    - sphinx-gallery
 name: all_cuda-122_arch-x86_64
@@ -0,0 +1,4 @@
Example gallery
===============

Below is a gallery of examples showing how to use cuml.
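Not part of this diff, but for anyone trying the gallery locally: sphinx-gallery also has to be wired into the Sphinx configuration. A minimal sketch of what that could look like in the docs' conf.py — the paths and the existing extension list are assumptions about the repo layout, not taken from this PR:

# In the Sphinx conf.py (sketch; paths below are assumed, not from this PR)
extensions = [
    # ... whatever extensions the docs already use ...
    "sphinx_gallery.gen_gallery",
]

sphinx_gallery_conf = {
    # where the example .py scripts live (assumed layout)
    "examples_dirs": ["../../examples"],
    # where sphinx-gallery writes the generated rST pages
    "gallery_dirs": ["auto_examples"],
}

With this in place, `sphinx_gallery.gen_gallery` runs each script, captures its plots, and generates the pages that the "Example gallery" index above links to.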
@@ -0,0 +1,6 @@ | ||
.. _cluster_examples: | ||
|
||
Clustering | ||
---------- | ||
|
||
Examples concerning the :mod:`cuml.cluster` module. |
@@ -0,0 +1,128 @@

Review comment: Copyright header

Author reply: Good point. I was wondering how to deal with licensing and attribution. My best guess so far would be something like the following. The BSD-3 license says we also need to reproduce the three clauses and the disclaimer, but that would add an excessive amount of text to the top of each file. Maybe we can deal with it in some other way? Is there prior art somewhere?
"""
===================================
Demo of DBSCAN clustering algorithm
===================================

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core
samples in regions of high density and expands clusters from them. This
algorithm is well suited to data that contains clusters of similar density.

.. note:: Example adapted from `the scikit-learn gallery <https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html>`_.

"""

# %%
# Data generation
# ---------------
#
# We use :func:`~cuml.datasets.make_blobs` to create 3 synthetic clusters.

import cupy as cp
from cuml.datasets import make_blobs
from cuml.preprocessing import StandardScaler

centers = cp.array([[1, 1], [-1, -1], [1, -1]])
X, labels_true = make_blobs(
    n_samples=750, centers=centers, cluster_std=0.4, random_state=0
)

X = StandardScaler().fit_transform(X)

# %%
# We can visualize the resulting data:

import matplotlib.pyplot as plt

# Copy the data from GPU memory to host memory for plotting.
X_ = X.get()
plt.scatter(X_[:, 0], X_[:, 1])
plt.show()

# %%
# Compute DBSCAN
# --------------
#
# One can access the labels assigned by :class:`~cuml.cluster.DBSCAN` using
# the `labels_` attribute. Noisy samples are given the label :math:`-1`.

import numpy as np

from cuml import metrics
from sklearn import metrics as sk_metrics
from cuml.cluster import DBSCAN

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(cp.unique(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels.get()).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

# %%
# Clustering algorithms are fundamentally unsupervised learning methods.
# However, since :func:`~cuml.datasets.make_blobs` gives access to the true
# labels of the synthetic clusters, it is possible to use evaluation metrics
# that leverage this "supervised" ground truth information to quantify the
# quality of the resulting clusters. Examples of such metrics are the
# homogeneity, completeness, V-measure, Rand index, adjusted Rand index and
# Adjusted Mutual Information (AMI).
#
# If the ground truth labels are not known, evaluation can only be performed
# using the model results themselves. In that case, the Silhouette Coefficient
# comes in handy.

print(f"Homogeneity: {metrics.homogeneity_score(labels_true.astype(cp.int32), labels):.3f}") | ||
print(f"Completeness: {metrics.completeness_score(labels_true.astype(cp.int32), labels):.3f}") | ||
print(f"V-measure: {metrics.v_measure_score(labels_true.astype(cp.int32), labels):.3f}") | ||
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true.astype(cp.int32), labels):.3f}") | ||
print( | ||
"Adjusted Mutual Information:" | ||
f" {sk_metrics.adjusted_mutual_info_score(labels_true.astype(cp.int32).get(), labels.get()):.3f}" | ||
) | ||
print(f"Silhouette Coefficient: {sk_metrics.silhouette_score(X_, labels.get()):.3f}") | ||
|
||
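As a possible follow-up, not part of this diff: the last metric round-trips through host memory via sklearn. A hedged sketch of keeping it on the GPU instead — it assumes the installed cuml version exposes silhouette_score under cuml.metrics.cluster, which this PR does not exercise:

# Hedged alternative: GPU-side silhouette, assuming this import exists in
# the installed cuml version (not used anywhere in this PR's diff).
from cuml.metrics.cluster import silhouette_score

print(f"Silhouette Coefficient (GPU): {silhouette_score(X, labels):.3f}")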
# %%
# Plot results
# ------------
#
# Core samples (large dots) and non-core samples (small dots) are color-coded
# according to the assigned cluster. Samples tagged as noise are represented in
# black.

unique_labels = cp.unique(labels)
core_samples_mask = cp.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
core_samples_mask = core_samples_mask.get()

colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k).get()

    xy = X_[class_member_mask & core_samples_mask]
    plt.plot(
        xy[:, 0],
        xy[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=14,
    )

    xy = X_[class_member_mask & ~core_samples_mask]
    plt.plot(
        xy[:, 0],
        xy[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=6,
    )

plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.show()