updates sparse scale #2942

Merged Apr 8, 2024 (30 commits)

Commits
0ee8601  updates sparse scale (Intron7, Mar 22, 2024)
a1662d1  use prange (Intron7, Mar 22, 2024)
e2652ad  adds release note (Intron7, Mar 22, 2024)
250bbcc  adds csc tests (Intron7, Mar 22, 2024)
b808433  remove prange to avoid segfault in test (Intron7, Mar 22, 2024)
44018e7  switches csc to csr for mask_obs (Intron7, Mar 22, 2024)
501b8a4  update docstring (Intron7, Mar 22, 2024)
efd0e55  Merge branch 'main' into inplace_sparse_scale (Intron7, Mar 22, 2024)
9cdf3e2  add kernel tests (Intron7, Mar 22, 2024)
fcca28f  update release note to performance (Intron7, Mar 27, 2024)
119cde7  Merge branch 'main' into inplace_sparse_scale (Intron7, Mar 27, 2024)
532333a  fixes end of file (Intron7, Mar 27, 2024)
b06e9b0  updates so dask is covered with mask (Intron7, Mar 28, 2024)
95540de  rework sparse scale (Intron7, Mar 28, 2024)
c9e2736  remove redundant line (Intron7, Mar 28, 2024)
b1336f0  update complier_constant (Intron7, Apr 2, 2024)
ba73360  Merge branch 'main' into inplace_sparse_scale (Intron7, Apr 2, 2024)
c0deab6  remove small oversight (Intron7, Apr 2, 2024)
35411aa  updates max_value (Intron7, Apr 2, 2024)
20b6d36  adds sparse kernel tests (Intron7, Apr 3, 2024)
87be224  update inner kernel (Intron7, Apr 3, 2024)
8e6f52b  move scale out of simple (Intron7, Apr 3, 2024)
38cbd29  updates a dependency (Intron7, Apr 3, 2024)
894426b  caches the kernel (Intron7, Apr 3, 2024)
3766925  only use kernel if a mask is given (Intron7, Apr 3, 2024)
a1a0ffd  fixes an issue with max_value for sparse matrixes (Intron7, Apr 3, 2024)
43edd8a  removes print (Intron7, Apr 3, 2024)
5f91805  remove parallel (Intron7, Apr 3, 2024)
99cd8a1  removee unused dependency (Intron7, Apr 4, 2024)
35dd438  Move numba code to it's own method (ivirshup, Apr 8, 2024)
3 changes: 2 additions & 1 deletion docs/release-notes/1.10.1.md
@@ -1,6 +1,5 @@
### 1.10.1 {small}`the future`


```{rubric} Docs
```

@@ -9,5 +8,7 @@

* Fix `aggregate` when aggregating by more than two groups {pr}`2965` {smaller}`I Virshup`


```{rubric} Performance
```
* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speeds up execution {pr}`2942` {smaller}`S Dicks`
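As a quick illustration of the fast path this note describes, here is a minimal usage sketch. The toy matrix and the mask column name `use_for_scaling` are invented for the example; only `scanpy.pp.scale` and its `zero_center`, `mask_obs`, and `max_value` arguments come from the PR itself.

```python
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy import sparse

# Toy CSR matrix and a boolean observation mask (illustrative values only).
rng = np.random.default_rng(0)
X = sparse.random(1000, 200, density=0.1, format="csr", dtype=np.float32, random_state=rng)
adata = AnnData(X)
adata.obs["use_for_scaling"] = rng.random(adata.n_obs) > 0.5

# With zero_center=False and mask_obs given, the data stays sparse and the
# masked rows are scaled by the new numba kernel; max_value clips after scaling.
sc.pp.scale(adata, zero_center=False, mask_obs="use_for_scaling", max_value=10)
```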
2 changes: 1 addition & 1 deletion scanpy/preprocessing/__init__.py
@@ -8,6 +8,7 @@
from ._pca import pca
from ._qc import calculate_qc_metrics
from ._recipes import recipe_seurat, recipe_weinreb17, recipe_zheng17
from ._scale import scale
from ._scrublet import scrublet, scrublet_simulate_doublets
from ._simple import (
downsample_counts,
@@ -16,7 +17,6 @@
log1p,
normalize_per_cell,
regress_out,
scale,
sqrt,
subsample,
)
310 changes: 310 additions & 0 deletions scanpy/preprocessing/_scale.py
@@ -0,0 +1,310 @@
from __future__ import annotations

import warnings
from functools import singledispatch
from operator import truediv
from typing import TYPE_CHECKING

import numba
import numpy as np
from anndata import AnnData
from scipy.sparse import issparse, isspmatrix_csc, spmatrix

from .. import logging as logg
from .._compat import DaskArray, old_positionals
from .._utils import (
_check_array_function_arguments,
axis_mul_or_truediv,
renamed_arg,
view_to_actual,
)
from ..get import _check_mask, _get_obs_rep, _set_obs_rep
from ._utils import _get_mean_var

# import dask if available
try:
import dask.array as da
except ImportError:
da = None

if TYPE_CHECKING:
from numpy.typing import NDArray


@numba.njit(cache=True, parallel=True)
def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, clip):
for i in numba.prange(len(indptr) - 1):
if mask_obs[i]:
for j in range(indptr[i], indptr[i + 1]):
if clip:
data[j] = min(clip, data[j] / std[indices[j]])

else:
data[j] /= std[indices[j]]


@renamed_arg("X", "data", pos_0=True)
@old_positionals("zero_center", "max_value", "copy", "layer", "obsm")
@singledispatch
def scale(
data: AnnData | spmatrix | np.ndarray | DaskArray,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
layer: str | None = None,
obsm: str | None = None,
mask_obs: NDArray[np.bool_] | str | None = None,
) -> AnnData | spmatrix | np.ndarray | DaskArray | None:
"""\
Scale data to unit variance and zero mean.

.. note::
Variables (genes) that do not display any variation (are constant across
all observations) are retained and (for zero_center==True) set to 0
during this operation. In the future, they might be set to NaNs.

Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
zero_center
If `False`, omit zero-centering variables, which allows handling sparse
input efficiently.
max_value
Clip (truncate) to this value after scaling. If `None`, do not clip.
copy
Whether this function should be performed inplace. If an AnnData object
is passed, this also determines if a copy is returned.
layer
If provided, which element of layers to scale.
obsm
If provided, which element of obsm to scale.
mask_obs
Restrict both the derivation of scaling parameters and the scaling itself
to a certain set of observations. The mask is specified as a boolean array
or a string referring to an array in :attr:`~anndata.AnnData.obs`.
This will transform data from csc to csr format if `issparse(data)`.

Returns
-------
Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:

`adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
Scaled count data matrix.
`adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
Means per gene before scaling.
`adata.var['std']` : :class:`pandas.Series` (dtype `float`)
Standard deviations per gene before scaling.
`adata.var['var']` : :class:`pandas.Series` (dtype `float`)
Variances per gene before scaling.
"""
_check_array_function_arguments(layer=layer, obsm=obsm)
if layer is not None:
raise ValueError(

f"`layer` argument inappropriate for value of type {type(data)}"
)
if obsm is not None:
raise ValueError(

f"`obsm` argument inappropriate for value of type {type(data)}"
)
return scale_array(

data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs
)


@scale.register(np.ndarray)
@scale.register(DaskArray)
def scale_array(
X: np.ndarray | DaskArray,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
return_mean_std: bool = False,
mask_obs: NDArray[np.bool_] | None = None,
) -> (
np.ndarray
| DaskArray
| tuple[
np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64]
]
| DaskArray
):
if copy:
X = X.copy()
if mask_obs is not None:
mask_obs = _check_mask(X, mask_obs, "obs")
scale_rv = scale_array(
X[mask_obs, :],
zero_center=zero_center,
max_value=max_value,
copy=False,
return_mean_std=return_mean_std,
mask_obs=None,
)

if return_mean_std:
X[mask_obs, :], mean, std = scale_rv
return X, mean, std
else:
X[mask_obs, :] = scale_rv
return X

if not zero_center and max_value is not None:
logg.info( # Be careful of what? This should be more specific
"... be careful when using `max_value` " "without `zero_center`."
)

if np.issubdtype(X.dtype, np.integer):
logg.info(
"... as scaling leads to float results, integer "
"input is cast to float, returning copy."
)
X = X.astype(float)

mean, var = _get_mean_var(X)
std = np.sqrt(var)
std[std == 0] = 1
if zero_center:
if isinstance(X, DaskArray) and issparse(X._meta):
warnings.warn(
"zero-center being used with `DaskArray` sparse chunks. This can be bad if you have large chunks or intend to eventually read the whole data into memory.",
UserWarning,
)
X -= mean

X = axis_mul_or_truediv(
X,
std,
op=truediv,
out=X if isinstance(X, np.ndarray) or issparse(X) else None,
axis=1,
)

# do the clipping
if max_value is not None:
logg.debug(f"... clipping at max_value {max_value}")
if isinstance(X, DaskArray) and issparse(X._meta):

def clip_set(x):
x = x.copy()
x[x > max_value] = max_value
if zero_center:
x[x < -max_value] = -max_value

return x

X = da.map_blocks(clip_set, X)
else:
if zero_center:
a_min, a_max = -max_value, max_value
X = np.clip(X, a_min, a_max) # dask does not accept these as kwargs
else:
if issparse(X):
X.data[X.data > max_value] = max_value
else:
X[X > max_value] = max_value
if return_mean_std:
return X, mean, std
else:
return X


@scale.register(spmatrix)
def scale_sparse(
X: spmatrix,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
return_mean_std: bool = False,
mask_obs: NDArray[np.bool_] | None = None,
) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]:
# need to add the following here to make inplace logic work
if zero_center:
logg.info(
"... as `zero_center=True`, sparse input is "
"densified and may lead to large memory consumption"
)
X = X.toarray()
copy = False # Since the data has been copied
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
mask_obs=mask_obs,
)
elif mask_obs is None:
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
mask_obs=mask_obs,
)
else:
if isspmatrix_csc(X):
X = X.tocsr()
elif copy:
X = X.copy()

if mask_obs is not None:
mask_obs = _check_mask(X, mask_obs, "obs")

mean, var = _get_mean_var(X[mask_obs, :])

std = np.sqrt(var)
std[std == 0] = 1

if max_value is None:
max_value = 0

_scale_sparse_numba(
X.indptr,
X.indices,
X.data,
std=std.astype(X.dtype),
mask_obs=mask_obs,
clip=max_value,
)

if return_mean_std:
return X, mean, std
else:
return X


@scale.register(AnnData)
def scale_anndata(
adata: AnnData,
*,
zero_center: bool = True,
max_value: float | None = None,
copy: bool = False,
layer: str | None = None,
obsm: str | None = None,
mask_obs: NDArray[np.bool_] | str | None = None,
) -> AnnData | None:
adata = adata.copy() if copy else adata
str_mean_std = ("mean", "std")
if mask_obs is not None:
if isinstance(mask_obs, str):
str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}")
else:
str_mean_std = ("mean with mask", "std with mask")
mask_obs = _check_mask(adata, mask_obs, "obs")
view_to_actual(adata)
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
X,
zero_center=zero_center,
max_value=max_value,
copy=False, # because a copy has already been made, if it were to be made
return_mean_std=True,
mask_obs=mask_obs,
)
_set_obs_rep(adata, X, layer=layer, obsm=obsm)
return adata if copy else None
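To make the sparse fast path concrete, here is a short NumPy/SciPy sketch of what the `_scale_sparse_numba` kernel above does: for every observation selected by `mask_obs`, each stored CSR value is divided by the standard deviation of its gene, with optional clipping. This is an illustrative sketch, not code from the PR: `scale_csr_masked`, the toy data, and the `ddof=1` reference std are invented for the check, numba and the scanpy helpers (`_get_mean_var`, `_check_mask`) are deliberately left out, and `clip=0` means "do not clip", mirroring how `scale_sparse` maps `max_value=None` to `0`.

```python
import numpy as np
from scipy import sparse


def scale_csr_masked(X: sparse.csr_matrix, std: np.ndarray,
                     mask_obs: np.ndarray, clip: float = 0) -> None:
    """Divide the stored values of masked rows by per-gene std, in place.

    Mirrors the loop structure of the PR's kernel; clip == 0 means "do not clip".
    """
    indptr, indices, data = X.indptr, X.indices, X.data
    for i in range(X.shape[0]):                    # observations (rows)
        if not mask_obs[i]:
            continue
        for j in range(indptr[i], indptr[i + 1]):  # stored entries of row i
            if clip:
                data[j] = min(clip, data[j] / std[indices[j]])
            else:
                data[j] /= std[indices[j]]


# Tiny consistency check against a dense reference (toy data).
rng = np.random.default_rng(0)
X = sparse.random(6, 4, density=0.5, format="csr", random_state=rng)
mask = np.array([True, False, True, True, False, True])

std = X[mask].toarray().std(axis=0, ddof=1)
std[std == 0] = 1  # constant genes are left untouched, as in scale_array

expected = X.toarray()
expected[mask] /= std

scale_csr_masked(X, std, mask)
assert np.allclose(X.toarray(), expected)
```

The same loop runs row-parallel under `numba.njit(cache=True, parallel=True)` in the PR, which is safe because each stored entry belongs to exactly one row and is written exactly once.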