Annotate decomposition module #650

Draft · wants to merge 1 commit into main
9 changes: 8 additions & 1 deletion dask_ml/decomposition/extmath.py
@@ -14,8 +14,15 @@
import numpy as np
from sklearn.utils.extmath import _safe_accumulator_op

from .._typing import ArrayLike

def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):

def _incremental_mean_and_var(
X: ArrayLike,
last_mean: ArrayLike,
last_variance: ArrayLike,
last_sample_count: ArrayLike,
):
"""
Note: Most of this function is taken from scikit-learn, except for the last line.

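For context, `_incremental_mean_and_var` maintains running per-feature statistics across batches; the `ArrayLike` annotation comes from `dask_ml._typing`, presumably a union of NumPy and dask array types. A minimal NumPy sketch of the underlying update rule (Chan et al.'s pairwise combine; the names are illustrative, not the dask-ml internals):

```python
# Illustrative sketch of the update _incremental_mean_and_var performs:
# combine running (mean, variance, count) with a new batch X.
import numpy as np

def update_mean_and_var(X, last_mean, last_variance, last_sample_count):
    new_count = X.shape[0]
    new_mean = X.mean(axis=0)
    new_var = X.var(axis=0)

    total_count = last_sample_count + new_count
    delta = new_mean - last_mean

    # Chan et al. pairwise combination of the two partial results.
    updated_mean = last_mean + delta * new_count / total_count
    m2 = (last_variance * last_sample_count
          + new_var * new_count
          + delta ** 2 * last_sample_count * new_count / total_count)
    return updated_mean, m2 / total_count, total_count
```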
29 changes: 18 additions & 11 deletions dask_ml/decomposition/incremental_pca.py
@@ -4,6 +4,8 @@
# Giorgio Patrini
# License: BSD 3 clause

from typing import Optional, Tuple

import dask
import numpy as np
from dask import array as da, compute, delayed
@@ -16,9 +18,10 @@
from ..utils import _svd_flip_copy, check_array
from . import pca
from .extmath import _incremental_mean_and_var
from .._typing import ArrayLike


def svd_flip(u, v):
def svd_flip(u: ArrayLike, v: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
"""
This replicates svd_flip(), but calls svd_flip_fixed()
instead of skm.svd_flip().
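The sign of each singular-vector pair from an SVD is arbitrary, so a deterministic flip is needed for reproducible components. A hedged sketch of the common u-based convention (scikit-learn flips so the largest-magnitude entry in each column of `u` is positive; the dask-ml variant delegates to `_svd_flip_copy` to handle dask arrays):

```python
# Sketch of a deterministic sign convention for SVD factors; not the
# actual dask-ml implementation, which calls _svd_flip_copy.
import numpy as np

def svd_flip_sketch(u, v):
    # Index of the largest-magnitude entry in each column of u.
    max_abs_rows = np.argmax(np.abs(u), axis=0)
    signs = np.sign(u[max_abs_rows, np.arange(u.shape[1])])
    # Flip columns of u and the matching rows of v together, so the
    # product u @ diag(s) @ v is unchanged.
    return u * signs, v * signs[:, np.newaxis]
```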
@@ -121,13 +124,13 @@ class IncrementalPCA(pca.PCA):

def __init__(
self,
n_components=None,
whiten=False,
copy=True,
batch_size=None,
svd_solver="auto",
iterated_power=0,
random_state=None,
n_components: Optional[int] = None,
whiten: bool = False,
copy: bool = True,
batch_size: Optional[int] = None,
svd_solver: str = "auto",
iterated_power: int = 0,
random_state: Optional[int] = None,
):
self.n_components = n_components
self.whiten = whiten
@@ -137,7 +140,7 @@ def __init__(
self.iterated_power = iterated_power
self.random_state = random_state

def _fit(self, X, y=None):
def _fit(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> "IncrementalPCA":
"""Fit the model with X, using minibatches of size batch_size.

Parameters
@@ -189,7 +192,9 @@ def _fit(self, X, y=None):

return self

def fit_transform(self, X, y=None):
def fit_transform(
    self, X: ArrayLike, y: Optional[ArrayLike] = None
) -> ArrayLike:
"""Fit the model with X and apply the dimensionality reduction on X.

Parameters
@@ -216,7 +221,9 @@ def fit_transform(self, X, y=None):
# fit method of arity 2 (supervised transformation)
return self.fit(X, y).transform(X)

def partial_fit(self, X, y=None, check_input=True):
def partial_fit(
self, X: ArrayLike, y: Optional[ArrayLike] = None, check_input: bool = True
) -> "IncrementalPCA":
"""Incremental fit with X. All of X is processed as a single batch.

Parameters
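A hypothetical usage sketch (not part of this diff) showing the annotated entry points on a chunked dask array:

```python
# Hypothetical usage of IncrementalPCA on a dask array; fit() consumes
# X in minibatches of batch_size rows.
import dask.array as da
from dask_ml.decomposition import IncrementalPCA

X = da.random.random((10_000, 20), chunks=(1_000, 20))
ipca = IncrementalPCA(n_components=5, batch_size=1_000)
ipca.fit(X)
X_reduced = ipca.transform(X)   # dask array of shape (10_000, 5)
```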
34 changes: 19 additions & 15 deletions dask_ml/decomposition/pca.py
@@ -1,4 +1,5 @@
import numbers
from typing import List, Optional, Tuple, Union

import dask
import dask.array as da
@@ -10,6 +11,7 @@
from sklearn.utils.validation import check_random_state

from .._compat import check_is_fitted
from .._typing import ArrayLike, DataFrameType
from .._utils import draw_seed
from ..utils import svd_flip

@@ -179,13 +181,13 @@ class PCA(sklearn.decomposition.PCA):

def __init__(
self,
n_components=None,
copy=True,
whiten=False,
n_components: Optional[int] = None,
copy: bool = True,
whiten: bool = False,
svd_solver="auto",
tol=0.0,
iterated_power=0,
random_state=None,
tol: float = 0.0,
iterated_power: int = 0,
random_state: Optional[Union[int, np.random.RandomState]] = None,
):
self.n_components = n_components
self.copy = copy
@@ -195,13 +197,13 @@ def __init__(
self.iterated_power = iterated_power
self.random_state = random_state

def fit(self, X, y=None):
def fit(self, X: da.Array, y: Optional[da.Array] = None) -> "PCA":
if not dask.is_dask_collection(X):
raise TypeError(_TYPE_MSG.format(type(X)))
self._fit(X)
return self

def _get_solver(self, X, n_components):
def _get_solver(self, X: da.Array, n_components: int) -> str:
n_samples, n_features = X.shape
solvers = {"full", "auto", "tsqr", "randomized"}
solver = self.svd_solver
@@ -248,7 +250,9 @@ def _get_solver(self, X, n_components):
raise ValueError(msg)
return solver

def _fit(self, X):
def _fit(
self, X: Union[ArrayLike, DataFrameType]
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
if isinstance(X, dd.DataFrame):
X = X.values

@@ -364,7 +368,7 @@ def _fit(self, X):

return U, S, V
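A hypothetical usage sketch for the annotated `fit`/`_fit` path: `fit` accepts only dask collections (a `TypeError` is raised otherwise, per the `is_dask_collection` check above), and `_fit` converts a dask DataFrame to its `.values` array before the SVD:

```python
# Hypothetical usage of PCA; non-dask input raises TypeError in fit().
import dask.array as da
from dask_ml.decomposition import PCA

X = da.random.random((1_000, 20), chunks=(250, 20))
pca = PCA(n_components=3, svd_solver="auto", random_state=0)
pca.fit(X)
print(pca.explained_variance_ratio_)
```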

def transform(self, X):
def transform(self, X: ArrayLike) -> ArrayLike:
"""Apply dimensionality reduction on X.

X is projected on the first principal components previous extracted
@@ -391,7 +395,7 @@ def transform(self, X):
X_transformed /= np.sqrt(self.explained_variance_)
return X_transformed

def fit_transform(self, X, y=None):
def fit_transform(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
"""Fit the model with X and apply the dimensionality reduction on X.

Parameters
@@ -422,7 +426,7 @@ def fit_transform(self, X, y=None):

return U

def inverse_transform(self, X):
def inverse_transform(self, X: ArrayLike) -> ArrayLike:
"""Transform data back to its original space.

Returns an array X_original whose transform would be X.
@@ -455,7 +459,7 @@ def inverse_transform(self, X):
else:
return da.dot(X, self.components_) + self.mean_

def score_samples(self, X):
def score_samples(self, X: ArrayLike) -> ArrayLike:
"""Return the log-likelihood of each sample.

See. "Pattern Recognition and Machine Learning"
@@ -482,7 +486,7 @@ def score_samples(self, X):
log_like -= 0.5 * (n_features * da.log(2.0 * np.pi) - fast_logdet(precision))
return log_like

def score(self, X, y=None):
def score(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
"""Return the average log-likelihood of all samples.

See. "Pattern Recognition and Machine Learning"
@@ -504,5 +508,5 @@ def score(self, X, y=None):
return da.mean(self.score_samples(X))


def _known_shape(shape):
def _known_shape(shape: List[int]) -> bool:
return all(isinstance(x, numbers.Integral) for x in shape)
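`_known_shape` exists because dask arrays can carry unknown dimensions: operations like boolean masking leave `nan` in `shape` until the graph is computed. A small illustration:

```python
# Why _known_shape matters: dask shapes may contain nan, not just ints.
import dask.array as da

x = da.random.random((100,), chunks=10)
masked = x[x > 0.5]     # chunk sizes unknown until computed
print(masked.shape)     # (nan,)
```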
20 changes: 14 additions & 6 deletions dask_ml/decomposition/truncated_svd.py
@@ -1,13 +1,21 @@
from typing import Optional

import dask.array as da
from dask import compute
from sklearn.base import BaseEstimator, TransformerMixin

from .._typing import ArrayLike
from ..utils import svd_flip


class TruncatedSVD(BaseEstimator, TransformerMixin):
def __init__(
self, n_components=2, algorithm="tsqr", n_iter=5, random_state=None, tol=0.0
self,
n_components: int = 2,
algorithm: str = "tsqr",
n_iter: int = 5,
random_state=None,
tol: float = 0.0,
):
"""Dimensionality reduction using truncated SVD (aka LSA).

@@ -115,7 +123,7 @@ def __init__(
self.random_state = random_state
self.tol = tol

def fit(self, X, y=None):
def fit(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> "TruncatedSVD":
"""Fit truncated SVD on training data X

Parameters
@@ -133,15 +141,15 @@ def fit(self, X, y=None):
self.fit_transform(X)
return self

def _check_array(self, X):
def _check_array(self, X: ArrayLike) -> ArrayLike:
if self.n_components >= X.shape[1]:
raise ValueError(
"n_components must be < n_features; "
"got {} >= {}".format(self.n_components, X.shape[1])
)
return X

def fit_transform(self, X, y=None):
def fit_transform(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
"""Fit model to X and perform dimensionality reduction on X.

Parameters
@@ -185,7 +193,7 @@ def fit_transform(self, X, y=None):
self.singular_values_ = sv
return X_transformed

def transform(self, X, y=None):
def transform(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
"""Perform dimensionality reduction on X.

Parameters
@@ -205,7 +213,7 @@ def transform(self, X, y=None):
"""
return X.dot(self.components_.T)

def inverse_transform(self, X):
def inverse_transform(self, X: ArrayLike) -> ArrayLike:
"""Transform X back to its original space.

Returns an array X_original whose transform would be X.
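Finally, a hypothetical end-to-end sketch of the annotated `TruncatedSVD` API on a tall-and-skinny dask array, where the default "tsqr" algorithm applies:

```python
# Hypothetical usage of TruncatedSVD; n_components must be < n_features,
# per the _check_array guard above.
import dask.array as da
from dask_ml.decomposition import TruncatedSVD

X = da.random.random((10_000, 50), chunks=(2_000, 50))
svd = TruncatedSVD(n_components=10, algorithm="tsqr", random_state=0)
X_reduced = svd.fit_transform(X)               # (10_000, 10)
X_restored = svd.inverse_transform(X_reduced)  # back in the original space
```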