Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates sparse scale #2942

Merged
merged 30 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0ee8601
updates sparse scale
Intron7 Mar 22, 2024
a1662d1
use prange
Intron7 Mar 22, 2024
e2652ad
adds release note
Intron7 Mar 22, 2024
250bbcc
adds csc tests
Intron7 Mar 22, 2024
b808433
remove prange to avoid segfault in test
Intron7 Mar 22, 2024
44018e7
switches csc to csr for mask_obs
Intron7 Mar 22, 2024
501b8a4
update docstring
Intron7 Mar 22, 2024
efd0e55
Merge branch 'main' into inplace_sparse_scale
Intron7 Mar 22, 2024
9cdf3e2
add kernel tests
Intron7 Mar 22, 2024
fcca28f
update release note to performance
Intron7 Mar 27, 2024
119cde7
Merge branch 'main' into inplace_sparse_scale
Intron7 Mar 27, 2024
532333a
fixes end of file
Intron7 Mar 27, 2024
b06e9b0
updates so dask is covered with mask
Intron7 Mar 28, 2024
95540de
rework sparse scale
Intron7 Mar 28, 2024
c9e2736
remove redundant line
Intron7 Mar 28, 2024
b1336f0
update complier_constant
Intron7 Apr 2, 2024
ba73360
Merge branch 'main' into inplace_sparse_scale
Intron7 Apr 2, 2024
c0deab6
remove small oversight
Intron7 Apr 2, 2024
35411aa
updates max_value
Intron7 Apr 2, 2024
20b6d36
adds sparse kernel tests
Intron7 Apr 3, 2024
87be224
update inner kernel
Intron7 Apr 3, 2024
8e6f52b
move scale out of simple
Intron7 Apr 3, 2024
38cbd29
updates a dependency
Intron7 Apr 3, 2024
894426b
caches the kernel
Intron7 Apr 3, 2024
3766925
only use kernel if a mask is given
Intron7 Apr 3, 2024
a1a0ffd
fixes an issue with max_value for sparse matrixes
Intron7 Apr 3, 2024
43edd8a
removes print
Intron7 Apr 3, 2024
5f91805
remove parallel
Intron7 Apr 3, 2024
99cd8a1
removee unused dependency
Intron7 Apr 4, 2024
35dd438
Move numba code to it's own method
ivirshup Apr 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/release-notes/1.10.1.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
### 1.10.1 {small}`the future`


```{rubric} Docs
```

Expand All @@ -9,5 +8,7 @@

* Fix `aggregate` when aggregating by more than two groups {pr}`2965` {smaller}`I Virshup`


```{rubric} Performance
```
* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speed up execution {pr}`2942` {smaller}`S Dicks`
76 changes: 69 additions & 7 deletions scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import scipy as sp
from anndata import AnnData
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix, issparse, isspmatrix_csr, spmatrix
from scipy.sparse import csr_matrix, issparse, isspmatrix_csc, isspmatrix_csr, spmatrix
from sklearn.utils import check_array, sparsefuncs

from .. import logging as logg
Expand Down Expand Up @@ -791,6 +791,7 @@
Restrict both the derivation of scaling parameters and the scaling itself
to a certain set of observations. The mask is specified as a boolean array
or a string referring to an array in :attr:`~anndata.AnnData.obs`.
This will transform data from csc to csr format if `issparse(data)`.

Returns
-------
Expand Down Expand Up @@ -849,6 +850,7 @@
return_mean_std=return_mean_std,
mask_obs=None,
)

if return_mean_std:
X[mask_obs, :], mean, std = scale_rv
return X, mean, std
Expand Down Expand Up @@ -929,15 +931,75 @@
)
X = X.toarray()
copy = False # Since the data has been copied
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
mask_obs=mask_obs,
)
elif mask_obs is None and isspmatrix_csc(X):
return scale_array(
X,
zero_center=zero_center,
copy=copy,
max_value=max_value,
return_mean_std=return_mean_std,
mask_obs=mask_obs,
)
else:
if isspmatrix_csc(X):
X = X.tocsr()
elif copy:
X = X.copy()

if mask_obs is not None:
mask_obs = _check_mask(X, mask_obs, "obs")
has_mask = True
else:
mask_obs = np.ones(X.shape[0], dtype=bool)
has_mask = False
mean, var = _get_mean_var(X[mask_obs, :])

std = np.sqrt(var)
std[std == 0] = 1

@numba.njit()
def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, has_mask, clip):
def _loop_scale(cell_ix):
for j in numba.prange(indptr[cell_ix], indptr[cell_ix + 1]):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this multithreaded if you don't pass parallel=True to njit?

Copy link
Member Author

@Intron7 Intron7 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its slower, because of the way the memory access happens. I tested it. So no its not multi-threaded

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok after running some more tests. Its not the memory access but the compile time. The speedup only happens for very large matrices in the 2nd run so I dont think its worth it.

if clip:
data[j] = min(clip, data[j] / std[indices[j]])

Check warning on line 973 in scanpy/preprocessing/_simple.py

View check run for this annotation

Codecov / codecov/patch

scanpy/preprocessing/_simple.py#L970-L973

Added lines #L970 - L973 were not covered by tests
else:
data[j] /= std[indices[j]]

Check warning on line 975 in scanpy/preprocessing/_simple.py

View check run for this annotation

Codecov / codecov/patch

scanpy/preprocessing/_simple.py#L975

Added line #L975 was not covered by tests

if has_mask:
for i in numba.prange(len(indptr) - 1):
if mask_obs[i]:
_loop_scale(i)

Check warning on line 980 in scanpy/preprocessing/_simple.py

View check run for this annotation

Codecov / codecov/patch

scanpy/preprocessing/_simple.py#L977-L980

Added lines #L977 - L980 were not covered by tests
else:
for i in numba.prange(len(indptr) - 1):
_loop_scale(i)

Check warning on line 983 in scanpy/preprocessing/_simple.py

View check run for this annotation

Codecov / codecov/patch

scanpy/preprocessing/_simple.py#L982-L983

Added lines #L982 - L983 were not covered by tests

if max_value is None:
max_value = 0

_scale_sparse_numba(
X.indptr,
X.indices,
X.data,
std=std.astype(X.dtype),
mask_obs=mask_obs,
has_mask=has_mask,
clip=max_value,
)

if return_mean_std:
return X, mean, std
else:
return X


@scale.register(AnnData)
def scale_anndata(
Expand Down
43 changes: 41 additions & 2 deletions scanpy/tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pytest
from anndata import AnnData
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix, csr_matrix

import scanpy as sc

Expand All @@ -23,6 +23,12 @@
[1, 0, 1, 0],
[0, 0, 0, 0],
] # with gene std 1,0,1,0 and center 0,0,0,0
X_scaled_original_clipped = [
[-1, 1, 0, 0],
[1, 1, 1, 0],
[0, 1, 1, 0],
] # with gene std 1,0,1,0 and center 0,2,1,0


X_for_mask = [
[27, 27, 27, 27],
Expand Down Expand Up @@ -51,9 +57,20 @@
[27, 27, 27, 27],
[27, 27, 27, 27],
]
X_scaled_for_mask_clipped = [
[27, 27, 27, 27],
[27, 27, 27, 27],
[-1, 1, 0, 0],
[1, 1, 1, 0],
[0, 1, 1, 0],
[27, 27, 27, 27],
[27, 27, 27, 27],
]


@pytest.mark.parametrize("typ", [np.array, csr_matrix], ids=lambda x: x.__name__)
@pytest.mark.parametrize(
"typ", [np.array, csr_matrix, csc_matrix], ids=lambda x: x.__name__
)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
@pytest.mark.parametrize(
("mask_obs", "X", "X_centered", "X_scaled"),
Expand Down Expand Up @@ -113,3 +130,25 @@ def test_clip(zero_center):
if zero_center:
assert adata.X.min() >= -1
assert adata.X.max() <= 1


@pytest.mark.parametrize(
("mask_obs", "X", "X_scaled", "X_clipped"),
[
(None, X_original, X_scaled_original, X_scaled_original_clipped),
(
np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool),
X_for_mask,
X_scaled_for_mask,
X_scaled_for_mask_clipped,
),
],
)
def test_scale_sparse(*, mask_obs, X, X_scaled, X_clipped):
adata0 = AnnData(csr_matrix(X).astype(np.float32))
sc.pp.scale(adata0, mask_obs=mask_obs, zero_center=False)
assert np.allclose(csr_matrix(adata0.X).toarray(), X_scaled)
# test scaling with explicit zero_center == True
adata1 = AnnData(csr_matrix(X).astype(np.float32))
sc.pp.scale(adata1, zero_center=False, mask_obs=mask_obs, max_value=1)
assert np.allclose(csr_matrix(adata1.X).toarray(), X_clipped)
Loading