Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates sparse scale #2942

Merged
merged 30 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0ee8601
updates sparse scale
Intron7 Mar 22, 2024
a1662d1
use prange
Intron7 Mar 22, 2024
e2652ad
adds release note
Intron7 Mar 22, 2024
250bbcc
adds csc tests
Intron7 Mar 22, 2024
b808433
remove prange to avoid segfault in test
Intron7 Mar 22, 2024
44018e7
switches csc to csr for mask_obs
Intron7 Mar 22, 2024
501b8a4
update docstring
Intron7 Mar 22, 2024
efd0e55
Merge branch 'main' into inplace_sparse_scale
Intron7 Mar 22, 2024
9cdf3e2
add kernel tests
Intron7 Mar 22, 2024
fcca28f
update release note to performance
Intron7 Mar 27, 2024
119cde7
Merge branch 'main' into inplace_sparse_scale
Intron7 Mar 27, 2024
532333a
fixes end of file
Intron7 Mar 27, 2024
b06e9b0
updates so dask is covered with mask
Intron7 Mar 28, 2024
95540de
rework sparse scale
Intron7 Mar 28, 2024
c9e2736
remove redundant line
Intron7 Mar 28, 2024
b1336f0
update complier_constant
Intron7 Apr 2, 2024
ba73360
Merge branch 'main' into inplace_sparse_scale
Intron7 Apr 2, 2024
c0deab6
remove small oversight
Intron7 Apr 2, 2024
35411aa
updates max_value
Intron7 Apr 2, 2024
20b6d36
adds sparse kernel tests
Intron7 Apr 3, 2024
87be224
update inner kernel
Intron7 Apr 3, 2024
8e6f52b
move scale out of simple
Intron7 Apr 3, 2024
38cbd29
updates a dependency
Intron7 Apr 3, 2024
894426b
caches the kernel
Intron7 Apr 3, 2024
3766925
only use kernel if a mask is given
Intron7 Apr 3, 2024
a1a0ffd
fixes an issue with max_value for sparse matrixes
Intron7 Apr 3, 2024
43edd8a
removes print
Intron7 Apr 3, 2024
5f91805
remove parallel
Intron7 Apr 3, 2024
99cd8a1
removee unused dependency
Intron7 Apr 4, 2024
35dd438
Move numba code to it's own method
ivirshup Apr 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/release-notes/1.10.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### 1.10.1 {small}`the future`

```{rubric} Features
Intron7 marked this conversation as resolved.
Show resolved Hide resolved
```
* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speeds up execution {pr}`2942` {smaller}`S Dicks`
3 changes: 3 additions & 0 deletions docs/release-notes/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

## Version 1.10

```{include} /release-notes/1.10.1.md
```

```{include} /release-notes/1.10.0.md
```

Expand Down
28 changes: 20 additions & 8 deletions scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import scipy as sp
from anndata import AnnData
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix, issparse, isspmatrix_csr, spmatrix
from scipy.sparse import csr_matrix, issparse, isspmatrix_csc, isspmatrix_csr, spmatrix
from sklearn.utils import check_array, sparsefuncs

from .. import logging as logg
Expand All @@ -29,7 +29,7 @@
)
from ..get import _check_mask, _get_obs_rep, _set_obs_rep
from ._distributed import materialize_as_ndarray
from ._utils import _get_mean_var
from ._utils import _get_mean_var, update_spmatrix_inplace

# install dask if available
try:
Expand Down Expand Up @@ -788,6 +788,7 @@ def scale(
Restrict both the derivation of scaling parameters and the scaling itself
to a certain set of observations. The mask is specified as a boolean array
or a string referring to an array in :attr:`~anndata.AnnData.obs`.
This will transform data from csc to csr format if `issparse(data)`.

Returns
-------
Expand Down Expand Up @@ -830,6 +831,8 @@ def scale_array(
X = X.copy()
if mask_obs is not None:
mask_obs = _check_mask(X, mask_obs, "obs")
if isspmatrix_csc(X):
X = X.tocsr()
scale_rv = scale_array(
X[mask_obs, :],
zero_center=zero_center,
Expand All @@ -838,12 +841,21 @@ def scale_array(
return_mean_std=return_mean_std,
mask_obs=None,
)
if return_mean_std:
X[mask_obs, :], mean, std = scale_rv
return X, mean, std
else:
X[mask_obs, :] = scale_rv
return X
if isinstance(X, np.ndarray):
if return_mean_std:
X[mask_obs, :], mean, std = scale_rv
return X, mean, std
else:
X[mask_obs, :] = scale_rv
return X
elif issparse(X):
if return_mean_std:
scaled, mean, std = scale_rv
update_spmatrix_inplace(X, scaled, mask_obs)
return X, mean, std
else:
update_spmatrix_inplace(X, scale_rv, mask_obs)
return X

if not zero_center and max_value is not None:
logg.info( # Be careful of what? This should be more specific
Expand Down
39 changes: 39 additions & 0 deletions scanpy/preprocessing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,42 @@
np.prod(dims), nsamp, random_state=random_state, method=method
)
return np.vstack(np.unravel_index(idx, dims)).T


def update_spmatrix_inplace(X, update, mask):
    """
    Update the values in a sparse matrix inplace.

    The stored values of the rows of `X` selected by `mask` are overwritten
    with the stored values of the corresponding rows of `update`: row ``i``
    of `update` is written to the ``i``-th selected row of `X`. Only the
    ``data`` array of `X` is modified; its sparsity structure is untouched.
    A row is only copied when it holds the same number of stored values in
    `X` and in `update`; other rows are left unchanged.

    Both matrices are handed to the CSR kernel through their ``indptr`` and
    ``data`` arrays, so they are expected to be row-compressed (CSR).

    Parameters
    ----------
    X
        The sparse matrix to update.
    update
        The values to update with.
    mask
        The mask of the values to update.

    Raises
    ------
    ValueError
        If `update` has more rows than `mask` selects.
    """
    subset_mask = np.where(mask)[0]

    # The compiled kernel reads ``subset_mask[i]`` for every row ``i`` of
    # `update`. Reject inputs that would make it read past the end of the
    # index array instead of relying on unchecked indexing in compiled code.
    n_update_rows = len(update.indptr) - 1
    if n_update_rows > len(subset_mask):
        raise ValueError(
            f"`update` has {n_update_rows} rows but `mask` only selects "
            f"{len(subset_mask)} rows of `X`."
        )

    _update_csr_inplace(
        X.indptr,
        X.data,
        update_indptr=update.indptr,
        update_data=update.data,
        mask=subset_mask,
    )


@numba.njit()
def _update_csr_inplace(indptr, data, *, update_indptr, update_data, mask):
    """
    Copy the stored values of `update_data` row by row into `data`.

    Row ``i`` described by `update_indptr` is written to the row ``mask[i]``
    described by `indptr`. A row is copied only when both rows hold the same
    number of stored values; otherwise the target row is left as it is.

    Parameters
    ----------
    indptr
        Row pointer array of the matrix being updated.
    data
        Value array of the matrix being updated; modified in place.
    update_indptr
        Row pointer array of the matrix holding the new values.
    update_data
        Value array of the matrix holding the new values.
    mask
        Integer row indices into the target matrix, one per update row.
    """
    # NOTE(review): Codecov reported the body of this kernel as not covered
    # by tests on the pull request, even though `test_inplace_sparse_kernel`
    # calls it. Presumably compiled code is not traced; confirm.
    n_update_rows = len(update_indptr) - 1
    for row in range(n_update_rows):
        src_begin = update_indptr[row]
        n_values = update_indptr[row + 1] - src_begin

        target_row = mask[row]
        dst_begin = indptr[target_row]
        dst_len = indptr[target_row + 1] - dst_begin

        # Rows with a different number of stored values are skipped.
        if n_values != dst_len:
            continue
        for offset in range(n_values):
            data[dst_begin + offset] = update_data[src_begin + offset]

Check warning on line 179 in scanpy/preprocessing/_utils.py

View check run for this annotation

Codecov / codecov/patch

scanpy/preprocessing/_utils.py#L177-L179

Added lines #L177 - L179 were not covered by tests
27 changes: 25 additions & 2 deletions scanpy/tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import numpy as np
import pytest
from anndata import AnnData
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix, csr_matrix

import scanpy as sc
from scanpy.preprocessing._utils import _update_csr_inplace, update_spmatrix_inplace

# test "data" for 3 cells * 4 genes
X_original = [
Expand Down Expand Up @@ -53,7 +54,9 @@
]


@pytest.mark.parametrize("typ", [np.array, csr_matrix], ids=lambda x: x.__name__)
@pytest.mark.parametrize(
"typ", [np.array, csr_matrix, csc_matrix], ids=lambda x: x.__name__
)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
@pytest.mark.parametrize(
("mask_obs", "X", "X_centered", "X_scaled"),
Expand Down Expand Up @@ -113,3 +116,23 @@ def test_clip(zero_center):
if zero_center:
assert adata.X.min() >= -1
assert adata.X.max() <= 1


def test_inplace_sparse():
    """Rows selected by a boolean mask are overwritten; the others are kept."""
    row_mask = np.array([False, True, True, False])
    target = csr_matrix(np.array([[2, 0, 2]] * 4))
    replacement = csr_matrix(np.array([[1, 0, 1]] * 2))
    expected = csr_matrix(
        np.array([[2, 0, 2], [1, 0, 1], [1, 0, 1], [2, 0, 2]])
    ).toarray()

    update_spmatrix_inplace(target, replacement, row_mask)

    np.testing.assert_equal(target.toarray(), expected)


def test_inplace_sparse_kernel():
    """The kernel writes update rows to the rows given as integer indices."""
    target_rows = np.array([1, 2])
    target = csr_matrix(np.array([[2, 0, 2]] * 4))
    replacement = csr_matrix(np.array([[1, 0, 1]] * 2))
    expected = csr_matrix(
        np.array([[2, 0, 2], [1, 0, 1], [1, 0, 1], [2, 0, 2]])
    ).toarray()

    _update_csr_inplace(
        target.indptr,
        target.data,
        update_indptr=replacement.indptr,
        update_data=replacement.data,
        mask=target_rows,
    )

    np.testing.assert_equal(target.toarray(), expected)
Loading