V0.1.2 #12


Merged
merged 29 commits on Nov 28, 2024
29 commits
1f3b65f
feat: allow explicit theta selection
WilliamCappelletti Oct 18, 2024
3ebebc4
feat: laplacians prior in estimation
WilliamCappelletti Oct 18, 2024
00de486
feat: backbone for impreved kgraphs
WilliamCappelletti Oct 18, 2024
3de6a41
refact: use set_params and get_params
WilliamCappelletti Oct 18, 2024
b59396b
feat: une-hot encoding
WilliamCappelletti Oct 18, 2024
9c035d5
feat: K-Graphs variation with centers
WilliamCappelletti Oct 18, 2024
fed0a17
feat: include theta parameerization
WilliamCappelletti Oct 18, 2024
210ebcc
doc: update
WilliamCappelletti Oct 18, 2024
e3cde1a
feat: KGraphsV2 available
WilliamCappelletti Oct 18, 2024
15e2333
refac!+feat: generate multilpe Laplacians and require seed as kwarg
WilliamCappelletti Oct 24, 2024
3211dee
remove unused test
WilliamCappelletti Oct 24, 2024
8da91bb
test clustering methods
WilliamCappelletti Oct 24, 2024
31bb0a2
test: compare metrics to baseline
WilliamCappelletti Oct 24, 2024
f6eb248
fix: avg_degree typehint
WilliamCappelletti Oct 24, 2024
3139bd2
fix: tests means
WilliamCappelletti Oct 24, 2024
d872525
doc: comment
WilliamCappelletti Oct 24, 2024
b819647
fix: kgraphs v2 coherent with v1
WilliamCappelletti Oct 24, 2024
de59179
test: doc values
WilliamCappelletti Oct 24, 2024
2e509b3
fix: handle single graph correctly
WilliamCappelletti Oct 24, 2024
02a16d8
doc: remove comments
WilliamCappelletti Oct 24, 2024
9535ea0
feat: blockwise theta
WilliamCappelletti Oct 24, 2024
4d47d0a
feat: allow separate degree for each block
WilliamCappelletti Oct 25, 2024
e5148dd
refact: change parameterization
WilliamCappelletti Oct 25, 2024
fc8bb4e
feat: GLMM supports blocks
WilliamCappelletti Oct 25, 2024
070b939
fix+test: blocks support
WilliamCappelletti Oct 25, 2024
3f55c40
Merge branch 'main' into dev
WilliamCappelletti Nov 11, 2024
f5c19f0
fix: correcr shape
WilliamCappelletti Nov 28, 2024
e02878b
typo
WilliamCappelletti Nov 28, 2024
0a1716d
refact: compute sq_pdiffs as internal method
WilliamCappelletti Nov 28, 2024
4 changes: 3 additions & 1 deletion graph_learn/clustering/__init__.py
@@ -1,8 +1,10 @@
"""GSP clustering algorithm"""

__all__ = [
"KGraphs",
"KGraphsV2",
"GLMM",
]

from .glmm import GLMM
from .kgraphs import KGraphs
from .kgraphs import KGraphs, KGraphsV2
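
For orientation, a minimal usage sketch of the newly exported KGraphsV2 (an editor's illustration, not part of the diff; it assumes the sklearn-style fit_predict contract shown in kgraphs.py below, and the data is synthetic):

import numpy as np

from graph_learn.clustering import KGraphsV2

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 10))  # design matrix: (n_samples, n_nodes)

# Cluster the signals while learning one graph Laplacian per cluster
model = KGraphsV2(n_clusters=2, avg_degree=2, random_state=0)
labels = model.fit_predict(x)

print(labels.shape)             # expected: (50,)
print(model.laplacians_.shape)  # expected: (2, 10, 10)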
120 changes: 99 additions & 21 deletions graph_learn/clustering/glmm.py
@@ -1,4 +1,5 @@
"""Implementation of Graph Laplacian Mixture Model"""

# pylint: disable=arguments-renamed

from typing import Optional
@@ -15,11 +16,47 @@


def _estimate_gauss_laplacian_parameters(
x: NDArray[np.float64], resp: NDArray[np.float64], avg_degree: float, delta: float
):
x: NDArray[np.float64],
resp: NDArray[np.float64],
delta: float,
*,
theta: float | NDArray[np.float64] | None = None,
avg_degree: Optional[int | dict[tuple[int, int], int]] = None,
blocks: NDArray[np.int64] = None,
laplacians: NDArray[np.float64] | None = None,
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
"""Estimate the parameters of Gaussian-Laplacian Mixture model, given the associations.

Args:
x (NDArray[np.float64]): Design matrix, shape (n_samples, n_nodes)
resp (NDArray[np.float64]): Association probabilities, shape (n_samples, n_components)
delta (float): Scale parameter of the learned graphs
theta (float | NDArray[np.float64] | None, optional): Scale parameter of signals, or array
of them. Defaults to None. Incompatible with avg_degree.
avg_degree (int | dict, optional): Expected average degree of the graphs. Defaults to
None. Incompatible with theta, which is then estimated internally via :method:`get_theta`.
blocks (NDArray[np.int64], optional): Node assignments to blocks. Defaults to None.
laplacians (NDArray[np.float64] | None, optional): Priors on the Laplacians, or previous
estimates. Defaults to None.

Raises:
ValueError: In case both theta and avg_degree are provided.
NotImplementedError: If laplacians are provided.

Returns:
tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]: Weights, means and
Laplacians of each component.
"""
if (theta is None) == (avg_degree is None):
raise ValueError("Exactly one of theta and avg_degree should be provided")

_n_samples, n_nodes = x.shape
_n_samples, n_components = resp.shape
laplacians = np.empty((n_components, n_nodes, n_nodes))

if laplacians is None:
edge_init = None
else:
# Must transpose as gsp_learn_graph_log_degrees expects edge weights on first axis
edge_init = -square_to_vec(laplacians).T

weights: NDArray[np.float64] = np.sum(resp, axis=0) # shape: n_components

@@ -30,11 +67,17 @@ def _estimate_gauss_laplacian_parameters(
y = resp[:, :, np.newaxis] * (x[:, np.newaxis, :] - means[np.newaxis, ...])
sq_dist = np.sum((y[..., np.newaxis] - y[..., np.newaxis, :]) ** 2, axis=0)

# theta = np.mean(sq_dist) / norm_par
# Theta should be on the order of np.mean(sq_dist)
if avg_degree is not None:
# get_theta returns inverted thetas
theta_inv = np.array([get_theta(sqd, avg_degree, blocks=blocks) for sqd in sq_dist])
if len(theta_inv.shape) < 2:
theta_inv = theta_inv[:, np.newaxis]
else:
theta_inv = 1 / theta

edge_weights = delta * gsp_learn_graph_log_degrees(
square_to_vec(sq_dist) * [[get_theta(sqd, avg_degree)] for sqd in sq_dist],
alpha=1,
beta=1,
square_to_vec(sq_dist) * theta_inv, alpha=1, beta=1, edge_init=edge_init
)

laplacians = laplacian_squareform_vec(edge_weights)
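
For reference, gsp_learn_graph_log_degrees appears to implement the log-degree graph-learning model of Kalofolias (2016); under that assumption, and up to constant factors, the call above solves, for each component,

\min_{w \ge 0} \; 2\, w^\top z \;-\; \alpha\, \mathbf{1}^\top \log(S w) \;+\; \beta\, \lVert w \rVert_2^2,

where w are the candidate edge weights, z = square_to_vec(sq_dist) * theta_inv are the scaled pairwise squared distances, and S maps edge weights to node degrees. The new edge_init argument warm-starts w from the prior Laplacians, and the result is rescaled by delta before being folded back into Laplacian form.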
@@ -53,12 +96,16 @@ class GLMM(BaseMixture):
max_iter (int, optional): Max EM iterations. Defaults to 100.
n_init (int, optional): Number of random initializations. Defaults to 1.
init_params (str, optional): Label initialization method. Defaults to "kmeans".
Accepts same values as :class:`GaussianMixture`.
regul (float, optional): GLMM regularization. Defaults to 0.15.
delta (float, optional): Graph learning param. Defaults to 2.
laplacian_init (Optional[float | str], optional): Method for laplacian initialization. Defaults to None.
Options are:
theta (float | NDArray[np.float64], optional): Alternative parameterization to
:arg:`avg_degree`; the scale parameter of the signals. Defaults to None.
delta (float, optional): Scale parameter of learned graphs. Defaults to 2.
laplacian_init (Optional[float | str], optional): Method for laplacian initialization.
Defaults to None. Options are:
- None: Estimate Laplacians from the first assignment estimates, given by :arg:`init_params`
- float: Initialize as fully connected with weights equal to the arg value
- 'random': Edge weights are sampled as uniform ranodm variables and Laplacians are extracted.
- 'random': Edge weights are sampled as uniform random variables.
random_state (_type_, optional): Random state. Defaults to None.
warm_start (bool, optional): Whether to use warm start in EM. Defaults to False.
verbose (int, optional): Verbosity level. Defaults to 0.
@@ -80,17 +127,18 @@ class GLMM(BaseMixture):
def __init__(
self,
n_components=1,
avg_degree: float = 0.5,
avg_degree: int | dict[tuple[int, int], int] = 2,
*,
tol=1e-3,
reg_covar=1e-6,
max_iter=100,
n_init=1,
init_params="kmeans",
regul: float = 0.15,
# norm_par: float = 1.5,
theta: NDArray[np.float64] = None,
delta: float = 2,
laplacian_init: Optional[float | str] = None,
blocks: NDArray[np.int64] = None,
random_state=None,
warm_start=False,
verbose=0,
@@ -110,9 +158,10 @@ def __init__(
)

self.regul = regul
# self.norm_par = norm_par
self.theta = theta
self.avg_degree = avg_degree
self.delta = delta
self.blocks = blocks

self.laplacian_init = laplacian_init

@@ -122,40 +171,68 @@ def __init__(
self.means_: NDArray[np.float64]
self.laplacians_: NDArray[np.float64]

self._propagate_laplacians = False

def _check_parameters(self, X):
pass

def _initialize(self, x, resp):
_n_samples, self.n_nodes_ = x.shape

self.weights_, self.means_, laplacians = _estimate_gauss_laplacian_parameters(
x, resp, self.avg_degree, self.delta
)

if self.laplacian_init is None:
self.laplacians_ = laplacians
self.laplacians_ = None

elif isinstance(self.laplacian_init, np.ndarray):
if self.laplacian_init.shape != (self.n_components, self.n_nodes_, self.n_nodes_):
raise ValueError("Laplacians must have shape (n_components, n_nodes, n_nodes)")

self._propagate_laplacians = True
self.laplacians_ = self.laplacian_init

raise NotImplementedError("Laplacian order should be related to resp to make sense")

elif isinstance(spread := self.laplacian_init, float):
self.laplacians_ = np.tile(
spread * np.eye(self.n_nodes_)
- spread / self.n_nodes_ * np.ones((self.n_nodes_, self.n_nodes_)),
(self.n_components, 1, 1),
)

elif self.laplacian_init == "random":
self.laplacians_ = np.stack(
[
sample_uniform_laplacian(self.n_nodes_, self.random_state)
for _ in range(self.n_components)
]
)

else:
raise ValueError("Invalid Laplacian init")

self.weights_, self.means_, self.laplacians_ = _estimate_gauss_laplacian_parameters(
x,
resp,
self.delta,
theta=self.theta,
avg_degree=self.avg_degree,
blocks=self.blocks,
laplacians=self.laplacians_ if self._propagate_laplacians else None,
)

def _m_step(self, x: NDArray[np.float64], log_resp: NDArray[np.float64]) -> None:
(
self.weights_,
self.means_,
self.laplacians_,
) = _estimate_gauss_laplacian_parameters(x, np.exp(log_resp), self.avg_degree, self.delta)
) = _estimate_gauss_laplacian_parameters(
x,
np.exp(log_resp),
self.delta,
theta=self.theta,
avg_degree=self.avg_degree,
blocks=self.blocks,
laplacians=self.laplacians_ if self._propagate_laplacians else None,
)
self.weights_ /= self.weights_.sum()

def _estimate_log_prob(self, x: ArrayLike) -> NDArray[np.float64]:
@@ -182,6 +259,7 @@ def _estimate_log_prob(self, x: ArrayLike) -> NDArray[np.float64]:
# shape: n_components
# log_weights = np.log(self.weights_)

# FIXME: This could be vectorized
for k in range(self.n_components):
# Compute pdf

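Before moving to the next file, a hedged end-to-end sketch of the updated GLMM interface (an editor's illustration, not part of the diff; it assumes GLMM keeps the fit/predict contract of sklearn's BaseMixture, and the data is synthetic):

import numpy as np

from graph_learn.clustering import GLMM

rng = np.random.default_rng(42)
x = rng.normal(size=(60, 8))  # (n_samples, n_nodes)

# theta and avg_degree are mutually exclusive parameterizations:
# _estimate_gauss_laplacian_parameters raises ValueError unless
# exactly one of them is set.
glmm = GLMM(n_components=2, avg_degree=2, random_state=0)
glmm.fit(x)

print(glmm.weights_.sum())     # expected: 1.0 (normalized in _m_step)
print(glmm.laplacians_.shape)  # expected: (2, 8, 8)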
103 changes: 90 additions & 13 deletions graph_learn/clustering/kgraphs.py
@@ -11,7 +11,8 @@
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state

from graph_learn.clustering.utils import init_labels
from graph_learn.clustering.glmm import _estimate_gauss_laplacian_parameters
from graph_learn.clustering.utils import init_labels, one_hot
from graph_learn.operators import laplacian_squareform_vec
from graph_learn.smooth_learning import get_theta, gsp_learn_graph_log_degrees

@@ -43,7 +44,7 @@ class KGraphs(BaseEstimator, ClusterMixin):
def __init__(
self,
n_clusters=1,
avg_degree: float = 0.5,
avg_degree: int = 2,
*,
max_iter=100,
n_init=1,
@@ -120,26 +121,18 @@ def _single_fit(self, x: NDArray[np.float64], _y=None) -> None:
self.labels_ = labels

def fit_predict(self, x: NDArray[np.float64], _y=None) -> NDArray[np.int64]:
n_samples, n_nodes = x.shape

best_score = np.inf
best_laplacians = np.empty((self.n_clusters, n_nodes, n_nodes))
best_labels = np.empty(n_samples, dtype=np.int64)
best_converged = None
best_params = {}

for _n in range(self.n_init):
self._single_fit(x)

if self.score_ < best_score:
best_score = self.score_
best_laplacians = self.laplacians_
best_labels = self.labels_
best_converged = self.converged_
best_params = self.get_params()

self.score_ = best_score
self.laplacians_ = best_laplacians
self.labels_ = best_labels
self.converged_ = best_converged
self.set_params(**best_params)

return self.labels_

@@ -154,3 +147,87 @@ def predict(self, x: NDArray[np.float64]) -> NDArray[np.int64]:
def predict_proba(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
"Return softmax of smoothness"
return softmax(-self._smoothness(x), axis=1)
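
A note on predict_proba above: assuming _smoothness evaluates the Laplacian quadratic form s_k(x_n) = x_n^\top L_k x_n (its centered counterpart appears in KGraphsV2 below), the soft assignment is the softmax of the negative smoothness,

p(k \mid x_n) = \frac{\exp(-s_k(x_n))}{\sum_j \exp(-s_j(x_n))},

so a signal that is smoother on a cluster's graph receives a higher probability for that cluster.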


class KGraphsV2(KGraphs):
"""Extension of KGraphs to allow for centers estimation"""

def __init__(
self,
n_clusters=1,
avg_degree: int = 2,
*,
max_iter=100,
n_init=1,
init_params="kmeans",
delta: float = 1,
theta: Optional[float] = None,
random_state: RandomState | None = None,
) -> None:
super().__init__(
n_clusters,
avg_degree,
max_iter=max_iter,
n_init=n_init,
init_params=init_params,
delta=delta,
random_state=random_state,
)

self.theta = theta

self.means_: NDArray[np.float64] # shape: (n_clusters, n_nodes)

def _init_parameters(self, x: NDArray[np.float64]):
self.random_state = check_random_state(self.random_state)

self.labels_ = init_labels(
x,
self.n_clusters,
init_params=self.init_params,
random_state=self.random_state,
)

_, self.means_, self.laplacians_ = _estimate_gauss_laplacian_parameters(
x,
one_hot(self.labels_, self.n_clusters),
self.delta,
theta=self.theta,
avg_degree=self.avg_degree,
)

self.converged_ = False

self.score_ = np.inf

def _centered_smoothness(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
x = x[np.newaxis, ...] - self.means_[:, np.newaxis, :]
return np.einsum("kni,kij,knj->nk", x, self.laplacians_, x)

def _single_fit(self, x: NDArray[np.float64], _y=None) -> None:
self._init_parameters(x)

for _i in range(self.max_iter):
# Compute assignments
# einsum.shape: (n_samples, n_clusters)

# FIXME: Why should I use smoothness instead of the centered one?
smoothness = self._smoothness(x)
labels = np.argmin(smoothness, axis=1)

self.score_ = np.sum(smoothness[np.arange(len(labels)), labels])

# Compute means and Laplacians
_, self.means_, self.laplacians_ = _estimate_gauss_laplacian_parameters(
x,
one_hot(labels, self.n_clusters),
self.delta,
theta=self.theta,
avg_degree=self.avg_degree,
)

if np.allclose(labels, self.labels_):
self.converged_ = True
return

self.labels_ = labels
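
For clarity on _centered_smoothness above: the einsum("kni,kij,knj->nk", ...) contraction evaluates the Laplacian quadratic form of the centered signals,

s_k(x_n) = (x_n - \mu_k)^\top L_k (x_n - \mu_k),

i.e. the smoothness of sample n on cluster k's graph after subtracting that cluster's mean, computed for every sample/cluster pair at once.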
11 changes: 11 additions & 0 deletions graph_learn/clustering/utils.py
@@ -45,3 +45,14 @@ def init_labels(
return kmeans_plusplus(x, n_clusters, random_state=random_state)
case _:
raise ValueError(f"Invalid init_params: {init_params}")


def one_hot(labels: NDArray[np.int_], n_labels: int = None) -> NDArray:
"""One hot encode labels"""
n_samples = labels.shape[0]
n_labels = n_labels or labels.max() + 1

y_one_hot = np.zeros((n_samples, n_labels))
y_one_hot[np.arange(n_samples), labels] = 1

return y_one_hot
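
A doctest-style illustration of the new helper (an editor's addition, not part of the diff):

>>> import numpy as np
>>> from graph_learn.clustering.utils import one_hot
>>> one_hot(np.array([0, 2, 1]), 3)
array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])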