From 41bc8fa7671e03e6b335670ebff9a3adaef595f4 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 8 Apr 2024 14:15:42 +0200
Subject: [PATCH] Backport PR #2942: updates sparse scale (#2985)

Co-authored-by: Severin Dicks <37635888+Intron7@users.noreply.github.com>
---
 docs/release-notes/1.10.1.md       |   3 +-
 scanpy/preprocessing/__init__.py   |   2 +-
 scanpy/preprocessing/_scale.py     | 310 +++++++++++++++++++++++++++++
 scanpy/preprocessing/_simple.py    | 228 +--------------------
 scanpy/tests/test_scaling.py      |  43 +++-
 scanpy/tools/_rank_genes_groups.py |   2 +-
 6 files changed, 356 insertions(+), 232 deletions(-)
 create mode 100644 scanpy/preprocessing/_scale.py

diff --git a/docs/release-notes/1.10.1.md b/docs/release-notes/1.10.1.md
index 201c7f864c..e94c9d6fdd 100644
--- a/docs/release-notes/1.10.1.md
+++ b/docs/release-notes/1.10.1.md
@@ -1,6 +1,5 @@
 ### 1.10.1 {small}`the future`
 
-
 ```{rubric} Docs
 ```
 
@@ -11,5 +10,7 @@
 
 * Fix `aggregate` when aggregating by more than two groups {pr}`2965` {smaller}`I Virshup`
 
+
 ```{rubric} Performance
 ```
+* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speeds up execution {pr}`2942` {smaller}`S Dicks`
diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index 3cfcff477d..8c396d8640 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -8,6 +8,7 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._recipes import recipe_seurat, recipe_weinreb17, recipe_zheng17
+from ._scale import scale
 from ._scrublet import scrublet, scrublet_simulate_doublets
 from ._simple import (
     downsample_counts,
@@ -16,7 +17,6 @@
     log1p,
     normalize_per_cell,
     regress_out,
-    scale,
     sqrt,
     subsample,
 )
diff --git a/scanpy/preprocessing/_scale.py b/scanpy/preprocessing/_scale.py
new file mode 100644
index 0000000000..5f7f526aea
--- /dev/null
+++ b/scanpy/preprocessing/_scale.py
@@ -0,0 +1,310 @@
+from __future__ import annotations
+
+import warnings
+from functools import singledispatch
+from operator import truediv
+from typing import TYPE_CHECKING
+
+import numba
+import numpy as np
+from anndata import AnnData
+from scipy.sparse import issparse, isspmatrix_csc, spmatrix
+
+from .. import logging as logg
+from .._compat import DaskArray, old_positionals
+from .._utils import (
+    _check_array_function_arguments,
+    axis_mul_or_truediv,
+    renamed_arg,
+    view_to_actual,
+)
+from ..get import _check_mask, _get_obs_rep, _set_obs_rep
+from ._utils import _get_mean_var
+
+# install dask if available
+try:
+    import dask.array as da
+except ImportError:
+    da = None
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+@numba.njit(cache=True, parallel=True)
+def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, clip):
+    for i in numba.prange(len(indptr) - 1):
+        if mask_obs[i]:
+            for j in range(indptr[i], indptr[i + 1]):
+                if clip:
+                    data[j] = min(clip, data[j] / std[indices[j]])
+                else:
+                    data[j] /= std[indices[j]]
+
+
+@renamed_arg("X", "data", pos_0=True)
+@old_positionals("zero_center", "max_value", "copy", "layer", "obsm")
+@singledispatch
+def scale(
+    data: AnnData | spmatrix | np.ndarray | DaskArray,
+    *,
+    zero_center: bool = True,
+    max_value: float | None = None,
+    copy: bool = False,
+    layer: str | None = None,
+    obsm: str | None = None,
+    mask_obs: NDArray[np.bool_] | str | None = None,
+) -> AnnData | spmatrix | np.ndarray | DaskArray | None:
+    """\
+    Scale data to unit variance and zero mean.
+
+    .. note::
+        Variables (genes) that do not display any variation (are constant across
+        all observations) are retained and (for zero_center==True) set to 0
+        during this operation. In the future, they might be set to NaNs.
+
+    Parameters
+    ----------
+    data
+        The (annotated) data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    zero_center
+        If `False`, omit zero-centering variables, which allows handling sparse
+        input efficiently.
+    max_value
+        Clip (truncate) to this value after scaling. If `None`, do not clip.
+    copy
+        Whether this function should be performed in place. If an AnnData object
+        is passed, this also determines if a copy is returned.
+    layer
+        If provided, which element of layers to scale.
+    obsm
+        If provided, which element of obsm to scale.
+    mask_obs
+        Restrict both the derivation of scaling parameters and the scaling itself
+        to a certain set of observations. The mask is specified as a boolean array
+        or a string referring to an array in :attr:`~anndata.AnnData.obs`.
+        This will transform data from csc to csr format if `issparse(data)`.
+
+    Returns
+    -------
+    Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
+
+    `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
+        Scaled count data matrix.
+    `adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
+        Means per gene before scaling.
+    `adata.var['std']` : :class:`pandas.Series` (dtype `float`)
+        Standard deviations per gene before scaling.
+    `adata.var['var']` : :class:`pandas.Series` (dtype `float`)
+        Variances per gene before scaling.
+ """ + _check_array_function_arguments(layer=layer, obsm=obsm) + if layer is not None: + raise ValueError( + f"`layer` argument inappropriate for value of type {type(data)}" + ) + if obsm is not None: + raise ValueError( + f"`obsm` argument inappropriate for value of type {type(data)}" + ) + return scale_array( + data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs + ) + + +@scale.register(np.ndarray) +@scale.register(DaskArray) +def scale_array( + X: np.ndarray | DaskArray, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + return_mean_std: bool = False, + mask_obs: NDArray[np.bool_] | None = None, +) -> ( + np.ndarray + | DaskArray + | tuple[ + np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64] + ] + | DaskArray +): + if copy: + X = X.copy() + if mask_obs is not None: + mask_obs = _check_mask(X, mask_obs, "obs") + scale_rv = scale_array( + X[mask_obs, :], + zero_center=zero_center, + max_value=max_value, + copy=False, + return_mean_std=return_mean_std, + mask_obs=None, + ) + + if return_mean_std: + X[mask_obs, :], mean, std = scale_rv + return X, mean, std + else: + X[mask_obs, :] = scale_rv + return X + + if not zero_center and max_value is not None: + logg.info( # Be careful of what? This should be more specific + "... be careful when using `max_value` " "without `zero_center`." + ) + + if np.issubdtype(X.dtype, np.integer): + logg.info( + "... as scaling leads to float results, integer " + "input is cast to float, returning copy." + ) + X = X.astype(float) + + mean, var = _get_mean_var(X) + std = np.sqrt(var) + std[std == 0] = 1 + if zero_center: + if isinstance(X, DaskArray) and issparse(X._meta): + warnings.warn( + "zero-center being used with `DaskArray` sparse chunks. This can be bad if you have large chunks or intend to eventually read the whole data into memory.", + UserWarning, + ) + X -= mean + + X = axis_mul_or_truediv( + X, + std, + op=truediv, + out=X if isinstance(X, np.ndarray) or issparse(X) else None, + axis=1, + ) + + # do the clipping + if max_value is not None: + logg.debug(f"... clipping at max_value {max_value}") + if isinstance(X, DaskArray) and issparse(X._meta): + + def clip_set(x): + x = x.copy() + x[x > max_value] = max_value + if zero_center: + x[x < -max_value] = -max_value + return x + + X = da.map_blocks(clip_set, X) + else: + if zero_center: + a_min, a_max = -max_value, max_value + X = np.clip(X, a_min, a_max) # dask does not accept these as kwargs + else: + if issparse(X): + X.data[X.data > max_value] = max_value + else: + X[X > max_value] = max_value + if return_mean_std: + return X, mean, std + else: + return X + + +@scale.register(spmatrix) +def scale_sparse( + X: spmatrix, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + return_mean_std: bool = False, + mask_obs: NDArray[np.bool_] | None = None, +) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]: + # need to add the following here to make inplace logic work + if zero_center: + logg.info( + "... 
as `zero_center=True`, sparse input is " + "densified and may lead to large memory consumption" + ) + X = X.toarray() + copy = False # Since the data has been copied + return scale_array( + X, + zero_center=zero_center, + copy=copy, + max_value=max_value, + return_mean_std=return_mean_std, + mask_obs=mask_obs, + ) + elif mask_obs is None: + return scale_array( + X, + zero_center=zero_center, + copy=copy, + max_value=max_value, + return_mean_std=return_mean_std, + mask_obs=mask_obs, + ) + else: + if isspmatrix_csc(X): + X = X.tocsr() + elif copy: + X = X.copy() + + if mask_obs is not None: + mask_obs = _check_mask(X, mask_obs, "obs") + + mean, var = _get_mean_var(X[mask_obs, :]) + + std = np.sqrt(var) + std[std == 0] = 1 + + if max_value is None: + max_value = 0 + + _scale_sparse_numba( + X.indptr, + X.indices, + X.data, + std=std.astype(X.dtype), + mask_obs=mask_obs, + clip=max_value, + ) + + if return_mean_std: + return X, mean, std + else: + return X + + +@scale.register(AnnData) +def scale_anndata( + adata: AnnData, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + layer: str | None = None, + obsm: str | None = None, + mask_obs: NDArray[np.bool_] | str | None = None, +) -> AnnData | None: + adata = adata.copy() if copy else adata + str_mean_std = ("mean", "std") + if mask_obs is not None: + if isinstance(mask_obs, str): + str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}") + else: + str_mean_std = ("mean with mask", "std with mask") + mask_obs = _check_mask(adata, mask_obs, "obs") + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer, obsm=obsm) + X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale( + X, + zero_center=zero_center, + max_value=max_value, + copy=False, # because a copy has already been made, if it were to be made + return_mean_std=True, + mask_obs=mask_obs, + ) + _set_obs_rep(adata, X, layer=layer, obsm=obsm) + return adata if copy else None diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index a8099f52e7..62f2290f31 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -7,7 +7,6 @@ import warnings from functools import singledispatch -from operator import truediv from typing import TYPE_CHECKING, Literal import numba @@ -24,15 +23,13 @@ from .._utils import ( AnyRandom, _check_array_function_arguments, - axis_mul_or_truediv, axis_sum, renamed_arg, sanitize_anndata, view_to_actual, ) -from ..get import _check_mask, _get_obs_rep, _set_obs_rep +from ..get import _get_obs_rep, _set_obs_rep from ._distributed import materialize_as_ndarray -from ._utils import _get_mean_var # install dask if available try: @@ -749,229 +746,6 @@ def _regress_out_chunk(data): return np.vstack(responses_chunk_list) -@renamed_arg("X", "data", pos_0=True) -@old_positionals("zero_center", "max_value", "copy", "layer", "obsm") -@singledispatch -def scale( - data: AnnData | spmatrix | np.ndarray | DaskArray, - *, - zero_center: bool = True, - max_value: float | None = None, - copy: bool = False, - layer: str | None = None, - obsm: str | None = None, - mask_obs: NDArray[np.bool_] | str | None = None, -) -> AnnData | spmatrix | np.ndarray | DaskArray | None: - """\ - Scale data to unit variance and zero mean. - - .. note:: - Variables (genes) that do not display any variation (are constant across - all observations) are retained and (for zero_center==True) set to 0 - during this operation. In the future, they might be set to NaNs. 
-
-    Parameters
-    ----------
-    data
-        The (annotated) data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    zero_center
-        If `False`, omit zero-centering variables, which allows to handle sparse
-        input efficiently.
-    max_value
-        Clip (truncate) to this value after scaling. If `None`, do not clip.
-    copy
-        Whether this function should be performed inplace. If an AnnData object
-        is passed, this also determines if a copy is returned.
-    layer
-        If provided, which element of layers to scale.
-    obsm
-        If provided, which element of obsm to scale.
-    mask_obs
-        Restrict both the derivation of scaling parameters and the scaling itself
-        to a certain set of observations. The mask is specified as a boolean array
-        or a string referring to an array in :attr:`~anndata.AnnData.obs`.
-
-    Returns
-    -------
-    Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
-
-    `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
-        Scaled count data matrix.
-    `adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
-        Means per gene before scaling.
-    `adata.var['std']` : :class:`pandas.Series` (dtype `float`)
-        Standard deviations per gene before scaling.
-    `adata.var['var']` : :class:`pandas.Series` (dtype `float`)
-        Variances per gene before scaling.
-    """
-    _check_array_function_arguments(layer=layer, obsm=obsm)
-    if layer is not None:
-        raise ValueError(
-            f"`layer` argument inappropriate for value of type {type(data)}"
-        )
-    if obsm is not None:
-        raise ValueError(
-            f"`obsm` argument inappropriate for value of type {type(data)}"
-        )
-    return scale_array(
-        data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs
-    )
-
-
-@scale.register(np.ndarray)
-@scale.register(DaskArray)
-def scale_array(
-    X: np.ndarray | DaskArray,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    return_mean_std: bool = False,
-    mask_obs: NDArray[np.bool_] | None = None,
-) -> (
-    np.ndarray
-    | DaskArray
-    | tuple[
-        np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64]
-    ]
-    | DaskArray
-):
-    if copy:
-        X = X.copy()
-    if mask_obs is not None:
-        mask_obs = _check_mask(X, mask_obs, "obs")
-        scale_rv = scale_array(
-            X[mask_obs, :],
-            zero_center=zero_center,
-            max_value=max_value,
-            copy=False,
-            return_mean_std=return_mean_std,
-            mask_obs=None,
-        )
-        if return_mean_std:
-            X[mask_obs, :], mean, std = scale_rv
-            return X, mean, std
-        else:
-            X[mask_obs, :] = scale_rv
-            return X
-
-    if not zero_center and max_value is not None:
-        logg.info(  # Be careful of what? This should be more specific
-            "... be careful when using `max_value` " "without `zero_center`."
-        )
-
-    if np.issubdtype(X.dtype, np.integer):
-        logg.info(
-            "... as scaling leads to float results, integer "
-            "input is cast to float, returning copy."
-        )
-        X = X.astype(float)
-
-    mean, var = _get_mean_var(X)
-    std = np.sqrt(var)
-    std[std == 0] = 1
-    if zero_center:
-        if isinstance(X, DaskArray) and issparse(X._meta):
-            warnings.warn(
-                "zero-center being used with `DaskArray` sparse chunks. This can be bad if you have large chunks or intend to eventually read the whole data into memory.",
-                UserWarning,
-            )
-        X -= mean
-    X = axis_mul_or_truediv(
-        X,
-        std,
-        op=truediv,
-        out=X if isinstance(X, np.ndarray) or issparse(X) else None,
-        axis=1,
-    )
-
-    # do the clipping
-    if max_value is not None:
-        logg.debug(f"... clipping at max_value {max_value}")
-        if isinstance(X, DaskArray) and issparse(X._meta):
-
-            def clip_set(x):
-                x = x.copy()
-                x[x > max_value] = max_value
-                if zero_center:
-                    x[x < -max_value] = -max_value
-                return x
-
-            X = da.map_blocks(clip_set, X)
-        else:
-            if zero_center:
-                a_min, a_max = -max_value, max_value
-                X = np.clip(X, a_min, a_max)  # dask does not accept these as kwargs
-            else:
-                X[X > max_value] = max_value
-    if return_mean_std:
-        return X, mean, std
-    else:
-        return X
-
-
-@scale.register(spmatrix)
-def scale_sparse(
-    X: spmatrix,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    return_mean_std: bool = False,
-    mask_obs: NDArray[np.bool_] | None = None,
-) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]:
-    # need to add the following here to make inplace logic work
-    if zero_center:
-        logg.info(
-            "... as `zero_center=True`, sparse input is "
-            "densified and may lead to large memory consumption"
-        )
-        X = X.toarray()
-        copy = False  # Since the data has been copied
-    return scale_array(
-        X,
-        zero_center=zero_center,
-        copy=copy,
-        max_value=max_value,
-        return_mean_std=return_mean_std,
-        mask_obs=mask_obs,
-    )
-
-
-@scale.register(AnnData)
-def scale_anndata(
-    adata: AnnData,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    layer: str | None = None,
-    obsm: str | None = None,
-    mask_obs: NDArray[np.bool_] | str | None = None,
-) -> AnnData | None:
-    adata = adata.copy() if copy else adata
-    str_mean_std = ("mean", "std")
-    if mask_obs is not None:
-        if isinstance(mask_obs, str):
-            str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}")
-        else:
-            str_mean_std = ("mean with mask", "std with mask")
-        mask_obs = _check_mask(adata, mask_obs, "obs")
-    view_to_actual(adata)
-    X = _get_obs_rep(adata, layer=layer, obsm=obsm)
-    X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
-        X,
-        zero_center=zero_center,
-        max_value=max_value,
-        copy=False,  # because a copy has already been made, if it were to be made
-        return_mean_std=True,
-        mask_obs=mask_obs,
-    )
-    _set_obs_rep(adata, X, layer=layer, obsm=obsm)
-    return adata if copy else None
-
-
 @old_positionals("n_obs", "random_state", "copy")
 def subsample(
     data: AnnData | np.ndarray | spmatrix,
diff --git a/scanpy/tests/test_scaling.py b/scanpy/tests/test_scaling.py
index ba425e4408..b169f7a51e 100644
--- a/scanpy/tests/test_scaling.py
+++ b/scanpy/tests/test_scaling.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 from anndata import AnnData
-from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix, csr_matrix
 
 import scanpy as sc
 
@@ -23,6 +23,12 @@
     [1, 0, 1, 0],
     [0, 0, 0, 0],
 ]  # with gene std 1,0,1,0 and center 0,0,0,0
+X_scaled_original_clipped = [
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+]  # with gene std 1,0,1,0 and center 0,2,1,0
+
 
 X_for_mask = [
     [27, 27, 27, 27],
@@ -51,9 +57,20 @@
     [27, 27, 27, 27],
     [27, 27, 27, 27],
 ]
+X_scaled_for_mask_clipped = [
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+]
 
 
-@pytest.mark.parametrize("typ", [np.array, csr_matrix], ids=lambda x: x.__name__)
+@pytest.mark.parametrize(
+    "typ", [np.array, csr_matrix, csc_matrix], ids=lambda x: x.__name__
+)
 @pytest.mark.parametrize("dtype", ["float32", "int64"])
 @pytest.mark.parametrize(
     ("mask_obs", "X", "X_centered", "X_scaled"),
@@ -113,3 +130,25 @@ def test_clip(zero_center):
     if zero_center:
         assert adata.X.min() >= -1
         assert adata.X.max() <= 1
+
+
+@pytest.mark.parametrize(
+    ("mask_obs", "X", "X_scaled", "X_clipped"),
+    [
+        (None, X_original, X_scaled_original, X_scaled_original_clipped),
+        (
+            np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool),
+            X_for_mask,
+            X_scaled_for_mask,
+            X_scaled_for_mask_clipped,
+        ),
+    ],
+)
+def test_scale_sparse(*, mask_obs, X, X_scaled, X_clipped):
+    adata0 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata0, mask_obs=mask_obs, zero_center=False)
+    assert np.allclose(csr_matrix(adata0.X).toarray(), X_scaled)
+    # test scaling with clipping at max_value and zero_center == False
+    adata1 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata1, zero_center=False, mask_obs=mask_obs, max_value=1)
+    assert np.allclose(csr_matrix(adata1.X).toarray(), X_clipped)
diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py
index c1aab62182..b91f846da1 100644
--- a/scanpy/tools/_rank_genes_groups.py
+++ b/scanpy/tools/_rank_genes_groups.py
@@ -14,7 +14,7 @@
 from .._compat import old_positionals
 from .._utils import check_nonnegative_integers
 from ..get import _check_mask
-from ..preprocessing._simple import _get_mean_var
+from ..preprocessing._utils import _get_mean_var
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
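
A minimal usage sketch of the fast path this patch adds: calling `sc.pp.scale` on a CSR matrix with `zero_center=False` and `mask_obs` dispatches to the new `_scale_sparse_numba` kernel. The toy matrix, mask, and `max_value` below are invented for illustration and are not part of the patch.

```python
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy.sparse import csr_matrix

# Toy data: 4 cells x 3 genes; the values are arbitrary.
X = csr_matrix(
    np.array(
        [
            [1.0, 0.0, 4.0],
            [3.0, 2.0, 0.0],
            [0.0, 6.0, 8.0],
            [5.0, 0.0, 2.0],
        ],
        dtype=np.float32,
    )
)
adata = AnnData(X)

# Scale only the first three cells; the last row is left untouched.
mask = np.array([True, True, True, False])

# zero_center=False keeps the matrix sparse; with mask_obs set, the numba
# kernel divides each stored value by its gene's std and clips at max_value.
sc.pp.scale(adata, zero_center=False, mask_obs=mask, max_value=3)

print(adata.X.toarray())           # the masked-out row keeps its raw values
print(adata.var["std with mask"])  # per-gene std computed from masked rows only
```

Note that with `zero_center=False` only values above `max_value` are clipped; there is no lower bound, which is what `min(clip, data[j] / std[indices[j]])` in `_scale_sparse_numba` implements.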