scverse · ivirshup · Apr 8, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/docs/release-notes/1.10.1.md b/docs/release-notes/1.10.1.md
@@ -1,6 +1,5 @@
 ### 1.10.1 {small}`the future`
 
-
 ```{rubric} Docs
 ```
 
@@ -9,5 +8,7 @@
 
 * Fix `aggregate` when aggregating by more than two groups {pr}`2965` {smaller}`I Virshup`
 
+
 ```{rubric} Performance
 ```
+* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speeds up execution {pr}`2942` {smaller}`S Dicks`
diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py
@@ -15,7 +15,7 @@
 import scipy as sp
 from anndata import AnnData
 from pandas.api.types import CategoricalDtype
-from scipy.sparse import csr_matrix, issparse, isspmatrix_csr, spmatrix
+from scipy.sparse import csr_matrix, issparse, isspmatrix_csc, isspmatrix_csr, spmatrix
 from sklearn.utils import check_array, sparsefuncs
 
 from .. import logging as logg
@@ -791,6 +791,7 @@
         Restrict both the derivation of scaling parameters and the scaling itself
         to a certain set of observations. The mask is specified as a boolean array
         or a string referring to an array in :attr:`~anndata.AnnData.obs`.
+        This will convert `data` from csc to csr format if it is a sparse csc matrix.
 
     Returns
     -------
@@ -849,6 +850,7 @@
             return_mean_std=return_mean_std,
             mask_obs=None,
         )
+
         if return_mean_std:
             X[mask_obs, :], mean, std = scale_rv
             return X, mean, std
@@ -929,15 +931,75 @@
         )
         X = X.toarray()
         copy = False  # Since the data has been copied
-    return scale_array(
-        X,
-        zero_center=zero_center,
-        copy=copy,
-        max_value=max_value,
-        return_mean_std=return_mean_std,
+        return scale_array(
+            X,
+            zero_center=zero_center,
+            copy=copy,
+            max_value=max_value,
+            return_mean_std=return_mean_std,
+            mask_obs=mask_obs,
+        )
+    elif mask_obs is None and isspmatrix_csc(X):
+        return scale_array(
+            X,
+            zero_center=zero_center,
+            copy=copy,
+            max_value=max_value,
+            return_mean_std=return_mean_std,
+            mask_obs=mask_obs,
+        )
+    else:
+        if isspmatrix_csc(X):
+            X = X.tocsr()
+        elif copy:
+            X = X.copy()
+
+        if mask_obs is not None:
+            mask_obs = _check_mask(X, mask_obs, "obs")
+            has_mask = True
+        else:
+            mask_obs = np.ones(X.shape[0], dtype=bool)
+            has_mask = False
+    mean, var = _get_mean_var(X[mask_obs, :])
+
+    std = np.sqrt(var)
+    std[std == 0] = 1
+
+    @numba.njit()
+    def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, has_mask, clip):
+        def _loop_scale(cell_ix):
+            for j in numba.prange(indptr[cell_ix], indptr[cell_ix + 1]):
+                if clip:
+                    data[j] = min(clip, data[j] / std[indices[j]])
+                else:
+                    data[j] /= std[indices[j]]
+
+        if has_mask:
+            for i in numba.prange(len(indptr) - 1):
+                if mask_obs[i]:
+                    _loop_scale(i)
+        else:
+            for i in numba.prange(len(indptr) - 1):
+                _loop_scale(i)
+
+    if max_value is None:
+        max_value = 0
+
+    _scale_sparse_numba(
+        X.indptr,
+        X.indices,
+        X.data,
+        std=std.astype(X.dtype),
         mask_obs=mask_obs,
+        has_mask=has_mask,
+        clip=max_value,
     )
 
+    if return_mean_std:
+        return X, mean, std
+    else:
+        return X
+
 
 @scale.register(AnnData)
 def scale_anndata(

diff --git a/scanpy/tests/test_scaling.py b/scanpy/tests/test_scaling.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 from anndata import AnnData
-from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix, csr_matrix
 
 import scanpy as sc
 
@@ -23,6 +23,12 @@
     [1, 0, 1, 0],
     [0, 0, 0, 0],
 ]  # with gene std 1,0,1,0 and center 0,0,0,0
+X_scaled_original_clipped = [
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+]  # with gene std 1,0,1,0 and center 0,2,1,0
+
 
 X_for_mask = [
     [27, 27, 27, 27],
@@ -51,9 +57,20 @@
     [27, 27, 27, 27],
     [27, 27, 27, 27],
 ]
+X_scaled_for_mask_clipped = [
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+]
 
 
-@pytest.mark.parametrize("typ", [np.array, csr_matrix], ids=lambda x: x.__name__)
+@pytest.mark.parametrize(
+    "typ", [np.array, csr_matrix, csc_matrix], ids=lambda x: x.__name__
+)
 @pytest.mark.parametrize("dtype", ["float32", "int64"])
 @pytest.mark.parametrize(
     ("mask_obs", "X", "X_centered", "X_scaled"),
@@ -113,3 +130,25 @@ def test_clip(zero_center):
     if zero_center:
         assert adata.X.min() >= -1
     assert adata.X.max() <= 1
+
+
+@pytest.mark.parametrize(
+    ("mask_obs", "X", "X_scaled", "X_clipped"),
+    [
+        (None, X_original, X_scaled_original, X_scaled_original_clipped),
+        (
+            np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool),
+            X_for_mask,
+            X_scaled_for_mask,
+            X_scaled_for_mask_clipped,
+        ),
+    ],
+)
+def test_scale_sparse(*, mask_obs, X, X_scaled, X_clipped):
+    adata0 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata0, mask_obs=mask_obs, zero_center=False)
+    assert np.allclose(csr_matrix(adata0.X).toarray(), X_scaled)
+    # test scaling with zero_center == False and clipping via max_value
+    adata1 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata1, zero_center=False, mask_obs=mask_obs, max_value=1)
+    assert np.allclose(csr_matrix(adata1.X).toarray(), X_clipped)