From 41bc8fa7671e03e6b335670ebff9a3adaef595f4 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 8 Apr 2024 14:15:42 +0200
Subject: [PATCH] Backport PR #2942: updates sparse scale (#2985)

Co-authored-by: Severin Dicks <37635888+Intron7@users.noreply.github.com>
---
 docs/release-notes/1.10.1.md       |   3 +-
 scanpy/preprocessing/__init__.py   |   2 +-
 scanpy/preprocessing/_scale.py     | 310 +++++++++++++++++++++++++++++
 scanpy/preprocessing/_simple.py    | 228 +--------------------
 scanpy/tests/test_scaling.py      |  43 +++-
 scanpy/tools/_rank_genes_groups.py |   2 +-
 6 files changed, 356 insertions(+), 232 deletions(-)
 create mode 100644 scanpy/preprocessing/_scale.py

diff --git a/docs/release-notes/1.10.1.md b/docs/release-notes/1.10.1.md
index 201c7f864c..e94c9d6fdd 100644
--- a/docs/release-notes/1.10.1.md
+++ b/docs/release-notes/1.10.1.md
@@ -1,6 +1,5 @@
 ### 1.10.1 {small}`the future`
 
-
 ```{rubric} Docs
 ```
 
@@ -11,5 +10,7 @@
 
 * Fix `aggregate` when aggregating by more than two groups {pr}`2965` {smaller}`I Virshup`
 
+
 ```{rubric} Performance
 ```
+* {func}`~scanpy.pp.scale` now uses numba kernels for `sparse.csr_matrix` and `sparse.csc_matrix` when `zero_center==False` and `mask_obs` is provided. This greatly speeds up execution {pr}`2942` {smaller}`S Dicks`
diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index 3cfcff477d..8c396d8640 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -8,6 +8,7 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._recipes import recipe_seurat, recipe_weinreb17, recipe_zheng17
+from ._scale import scale
 from ._scrublet import scrublet, scrublet_simulate_doublets
 from ._simple import (
     downsample_counts,
@@ -16,7 +17,6 @@
     log1p,
     normalize_per_cell,
     regress_out,
-    scale,
     sqrt,
     subsample,
 )
diff --git a/scanpy/preprocessing/_scale.py b/scanpy/preprocessing/_scale.py
new file mode 100644
index 0000000000..5f7f526aea
--- /dev/null
+++ b/scanpy/preprocessing/_scale.py
@@ -0,0 +1,310 @@
+from __future__ import annotations
+
+import warnings
+from functools import singledispatch
+from operator import truediv
+from typing import TYPE_CHECKING
+
+import numba
+import numpy as np
+from anndata import AnnData
+from scipy.sparse import issparse, isspmatrix_csc, spmatrix
+
+from .. import logging as logg
+from .._compat import DaskArray, old_positionals
+from .._utils import (
+    _check_array_function_arguments,
+    axis_mul_or_truediv,
+    renamed_arg,
+    view_to_actual,
+)
+from ..get import _check_mask, _get_obs_rep, _set_obs_rep
+from ._utils import _get_mean_var
+
+# install dask if available
+try:
+    import dask.array as da
+except ImportError:
+    da = None
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+@numba.njit(cache=True, parallel=True)
+def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, clip):
+    for i in numba.prange(len(indptr) - 1):
+        if mask_obs[i]:
+            for j in range(indptr[i], indptr[i + 1]):
+                if clip:
+                    data[j] = min(clip, data[j] / std[indices[j]])
+                else:
+                    data[j] /= std[indices[j]]
+
+
+@renamed_arg("X", "data", pos_0=True)
+@old_positionals("zero_center", "max_value", "copy", "layer", "obsm")
+@singledispatch
+def scale(
+    data: AnnData | spmatrix | np.ndarray | DaskArray,
+    *,
+    zero_center: bool = True,
+    max_value: float | None = None,
+    copy: bool = False,
+    layer: str | None = None,
+    obsm: str | None = None,
+    mask_obs: NDArray[np.bool_] | str | None = None,
+) -> AnnData | spmatrix | np.ndarray | DaskArray | None:
+    """\
+    Scale data to unit variance and zero mean.
+
+    .. note::
+        Variables (genes) that do not display any variation (are constant across
+        all observations) are retained and (for zero_center==True) set to 0
+        during this operation. In the future, they might be set to NaNs.
+
+    Parameters
+    ----------
+    data
+        The (annotated) data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    zero_center
+        If `False`, omit zero-centering variables, which allows handling sparse
+        input efficiently.
+    max_value
+        Clip (truncate) to this value after scaling. If `None`, do not clip.
+    copy
+        Whether this function should be performed in place. If an AnnData object
+        is passed, this also determines if a copy is returned.
+    layer
+        If provided, which element of layers to scale.
+    obsm
+        If provided, which element of obsm to scale.
+    mask_obs
+        Restrict both the derivation of scaling parameters and the scaling itself
+        to a certain set of observations. The mask is specified as a boolean array
+        or a string referring to an array in :attr:`~anndata.AnnData.obs`.
+        This will transform data from csc to csr format if `issparse(data)`.
+
+    Returns
+    -------
+    Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
+
+    `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
+        Scaled count data matrix.
+    `adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
+        Means per gene before scaling.
+    `adata.var['std']` : :class:`pandas.Series` (dtype `float`)
+        Standard deviations per gene before scaling.
+    `adata.var['var']` : :class:`pandas.Series` (dtype `float`)
+        Variances per gene before scaling.
+ """ + _check_array_function_arguments(layer=layer, obsm=obsm) + if layer is not None: + raise ValueError( + f"`layer` argument inappropriate for value of type {type(data)}" + ) + if obsm is not None: + raise ValueError( + f"`obsm` argument inappropriate for value of type {type(data)}" + ) + return scale_array( + data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs + ) + + +@scale.register(np.ndarray) +@scale.register(DaskArray) +def scale_array( + X: np.ndarray | DaskArray, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + return_mean_std: bool = False, + mask_obs: NDArray[np.bool_] | None = None, +) -> ( + np.ndarray + | DaskArray + | tuple[ + np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64] + ] + | DaskArray +): + if copy: + X = X.copy() + if mask_obs is not None: + mask_obs = _check_mask(X, mask_obs, "obs") + scale_rv = scale_array( + X[mask_obs, :], + zero_center=zero_center, + max_value=max_value, + copy=False, + return_mean_std=return_mean_std, + mask_obs=None, + ) + + if return_mean_std: + X[mask_obs, :], mean, std = scale_rv + return X, mean, std + else: + X[mask_obs, :] = scale_rv + return X + + if not zero_center and max_value is not None: + logg.info( # Be careful of what? This should be more specific + "... be careful when using `max_value` " "without `zero_center`." + ) + + if np.issubdtype(X.dtype, np.integer): + logg.info( + "... as scaling leads to float results, integer " + "input is cast to float, returning copy." + ) + X = X.astype(float) + + mean, var = _get_mean_var(X) + std = np.sqrt(var) + std[std == 0] = 1 + if zero_center: + if isinstance(X, DaskArray) and issparse(X._meta): + warnings.warn( + "zero-center being used with `DaskArray` sparse chunks. This can be bad if you have large chunks or intend to eventually read the whole data into memory.", + UserWarning, + ) + X -= mean + + X = axis_mul_or_truediv( + X, + std, + op=truediv, + out=X if isinstance(X, np.ndarray) or issparse(X) else None, + axis=1, + ) + + # do the clipping + if max_value is not None: + logg.debug(f"... clipping at max_value {max_value}") + if isinstance(X, DaskArray) and issparse(X._meta): + + def clip_set(x): + x = x.copy() + x[x > max_value] = max_value + if zero_center: + x[x < -max_value] = -max_value + return x + + X = da.map_blocks(clip_set, X) + else: + if zero_center: + a_min, a_max = -max_value, max_value + X = np.clip(X, a_min, a_max) # dask does not accept these as kwargs + else: + if issparse(X): + X.data[X.data > max_value] = max_value + else: + X[X > max_value] = max_value + if return_mean_std: + return X, mean, std + else: + return X + + +@scale.register(spmatrix) +def scale_sparse( + X: spmatrix, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + return_mean_std: bool = False, + mask_obs: NDArray[np.bool_] | None = None, +) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]: + # need to add the following here to make inplace logic work + if zero_center: + logg.info( + "... 
as `zero_center=True`, sparse input is " + "densified and may lead to large memory consumption" + ) + X = X.toarray() + copy = False # Since the data has been copied + return scale_array( + X, + zero_center=zero_center, + copy=copy, + max_value=max_value, + return_mean_std=return_mean_std, + mask_obs=mask_obs, + ) + elif mask_obs is None: + return scale_array( + X, + zero_center=zero_center, + copy=copy, + max_value=max_value, + return_mean_std=return_mean_std, + mask_obs=mask_obs, + ) + else: + if isspmatrix_csc(X): + X = X.tocsr() + elif copy: + X = X.copy() + + if mask_obs is not None: + mask_obs = _check_mask(X, mask_obs, "obs") + + mean, var = _get_mean_var(X[mask_obs, :]) + + std = np.sqrt(var) + std[std == 0] = 1 + + if max_value is None: + max_value = 0 + + _scale_sparse_numba( + X.indptr, + X.indices, + X.data, + std=std.astype(X.dtype), + mask_obs=mask_obs, + clip=max_value, + ) + + if return_mean_std: + return X, mean, std + else: + return X + + +@scale.register(AnnData) +def scale_anndata( + adata: AnnData, + *, + zero_center: bool = True, + max_value: float | None = None, + copy: bool = False, + layer: str | None = None, + obsm: str | None = None, + mask_obs: NDArray[np.bool_] | str | None = None, +) -> AnnData | None: + adata = adata.copy() if copy else adata + str_mean_std = ("mean", "std") + if mask_obs is not None: + if isinstance(mask_obs, str): + str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}") + else: + str_mean_std = ("mean with mask", "std with mask") + mask_obs = _check_mask(adata, mask_obs, "obs") + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer, obsm=obsm) + X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale( + X, + zero_center=zero_center, + max_value=max_value, + copy=False, # because a copy has already been made, if it were to be made + return_mean_std=True, + mask_obs=mask_obs, + ) + _set_obs_rep(adata, X, layer=layer, obsm=obsm) + return adata if copy else None diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index a8099f52e7..62f2290f31 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -7,7 +7,6 @@ import warnings from functools import singledispatch -from operator import truediv from typing import TYPE_CHECKING, Literal import numba @@ -24,15 +23,13 @@ from .._utils import ( AnyRandom, _check_array_function_arguments, - axis_mul_or_truediv, axis_sum, renamed_arg, sanitize_anndata, view_to_actual, ) -from ..get import _check_mask, _get_obs_rep, _set_obs_rep +from ..get import _get_obs_rep, _set_obs_rep from ._distributed import materialize_as_ndarray -from ._utils import _get_mean_var # install dask if available try: @@ -749,229 +746,6 @@ def _regress_out_chunk(data): return np.vstack(responses_chunk_list) -@renamed_arg("X", "data", pos_0=True) -@old_positionals("zero_center", "max_value", "copy", "layer", "obsm") -@singledispatch -def scale( - data: AnnData | spmatrix | np.ndarray | DaskArray, - *, - zero_center: bool = True, - max_value: float | None = None, - copy: bool = False, - layer: str | None = None, - obsm: str | None = None, - mask_obs: NDArray[np.bool_] | str | None = None, -) -> AnnData | spmatrix | np.ndarray | DaskArray | None: - """\ - Scale data to unit variance and zero mean. - - .. note:: - Variables (genes) that do not display any variation (are constant across - all observations) are retained and (for zero_center==True) set to 0 - during this operation. In the future, they might be set to NaNs. 
-
-    Parameters
-    ----------
-    data
-        The (annotated) data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    zero_center
-        If `False`, omit zero-centering variables, which allows to handle sparse
-        input efficiently.
-    max_value
-        Clip (truncate) to this value after scaling. If `None`, do not clip.
-    copy
-        Whether this function should be performed inplace. If an AnnData object
-        is passed, this also determines if a copy is returned.
-    layer
-        If provided, which element of layers to scale.
-    obsm
-        If provided, which element of obsm to scale.
-    mask_obs
-        Restrict both the derivation of scaling parameters and the scaling itself
-        to a certain set of observations. The mask is specified as a boolean array
-        or a string referring to an array in :attr:`~anndata.AnnData.obs`.
-
-    Returns
-    -------
-    Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields:
-
-    `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
-        Scaled count data matrix.
-    `adata.var['mean']` : :class:`pandas.Series` (dtype `float`)
-        Means per gene before scaling.
-    `adata.var['std']` : :class:`pandas.Series` (dtype `float`)
-        Standard deviations per gene before scaling.
-    `adata.var['var']` : :class:`pandas.Series` (dtype `float`)
-        Variances per gene before scaling.
-    """
-    _check_array_function_arguments(layer=layer, obsm=obsm)
-    if layer is not None:
-        raise ValueError(
-            f"`layer` argument inappropriate for value of type {type(data)}"
-        )
-    if obsm is not None:
-        raise ValueError(
-            f"`obsm` argument inappropriate for value of type {type(data)}"
-        )
-    return scale_array(
-        data, zero_center=zero_center, max_value=max_value, copy=copy, mask_obs=mask_obs
-    )
-
-
-@scale.register(np.ndarray)
-@scale.register(DaskArray)
-def scale_array(
-    X: np.ndarray | DaskArray,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    return_mean_std: bool = False,
-    mask_obs: NDArray[np.bool_] | None = None,
-) -> (
-    np.ndarray
-    | DaskArray
-    | tuple[
-        np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64]
-    ]
-    | DaskArray
-):
-    if copy:
-        X = X.copy()
-    if mask_obs is not None:
-        mask_obs = _check_mask(X, mask_obs, "obs")
-        scale_rv = scale_array(
-            X[mask_obs, :],
-            zero_center=zero_center,
-            max_value=max_value,
-            copy=False,
-            return_mean_std=return_mean_std,
-            mask_obs=None,
-        )
-        if return_mean_std:
-            X[mask_obs, :], mean, std = scale_rv
-            return X, mean, std
-        else:
-            X[mask_obs, :] = scale_rv
-            return X
-
-    if not zero_center and max_value is not None:
-        logg.info(  # Be careful of what? This should be more specific
-            "... be careful when using `max_value` " "without `zero_center`."
-        )
-
-    if np.issubdtype(X.dtype, np.integer):
-        logg.info(
-            "... as scaling leads to float results, integer "
-            "input is cast to float, returning copy."
-        )
-        X = X.astype(float)
-
-    mean, var = _get_mean_var(X)
-    std = np.sqrt(var)
-    std[std == 0] = 1
-    if zero_center:
-        if isinstance(X, DaskArray) and issparse(X._meta):
-            warnings.warn(
-                "zero-center being used with `DaskArray` sparse chunks. This can be bad if you have large chunks or intend to eventually read the whole data into memory.",
-                UserWarning,
-            )
-        X -= mean
-    X = axis_mul_or_truediv(
-        X,
-        std,
-        op=truediv,
-        out=X if isinstance(X, np.ndarray) or issparse(X) else None,
-        axis=1,
-    )
-
-    # do the clipping
-    if max_value is not None:
-        logg.debug(f"... clipping at max_value {max_value}")
-        if isinstance(X, DaskArray) and issparse(X._meta):
-
-            def clip_set(x):
-                x = x.copy()
-                x[x > max_value] = max_value
-                if zero_center:
-                    x[x < -max_value] = -max_value
-                return x
-
-            X = da.map_blocks(clip_set, X)
-        else:
-            if zero_center:
-                a_min, a_max = -max_value, max_value
-                X = np.clip(X, a_min, a_max)  # dask does not accept these as kwargs
-            else:
-                X[X > max_value] = max_value
-    if return_mean_std:
-        return X, mean, std
-    else:
-        return X
-
-
-@scale.register(spmatrix)
-def scale_sparse(
-    X: spmatrix,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    return_mean_std: bool = False,
-    mask_obs: NDArray[np.bool_] | None = None,
-) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]:
-    # need to add the following here to make inplace logic work
-    if zero_center:
-        logg.info(
-            "... as `zero_center=True`, sparse input is "
-            "densified and may lead to large memory consumption"
-        )
-        X = X.toarray()
-        copy = False  # Since the data has been copied
-    return scale_array(
-        X,
-        zero_center=zero_center,
-        copy=copy,
-        max_value=max_value,
-        return_mean_std=return_mean_std,
-        mask_obs=mask_obs,
-    )
-
-
-@scale.register(AnnData)
-def scale_anndata(
-    adata: AnnData,
-    *,
-    zero_center: bool = True,
-    max_value: float | None = None,
-    copy: bool = False,
-    layer: str | None = None,
-    obsm: str | None = None,
-    mask_obs: NDArray[np.bool_] | str | None = None,
-) -> AnnData | None:
-    adata = adata.copy() if copy else adata
-    str_mean_std = ("mean", "std")
-    if mask_obs is not None:
-        if isinstance(mask_obs, str):
-            str_mean_std = (f"mean of {mask_obs}", f"std of {mask_obs}")
-        else:
-            str_mean_std = ("mean with mask", "std with mask")
-        mask_obs = _check_mask(adata, mask_obs, "obs")
-    view_to_actual(adata)
-    X = _get_obs_rep(adata, layer=layer, obsm=obsm)
-    X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
-        X,
-        zero_center=zero_center,
-        max_value=max_value,
-        copy=False,  # because a copy has already been made, if it were to be made
-        return_mean_std=True,
-        mask_obs=mask_obs,
-    )
-    _set_obs_rep(adata, X, layer=layer, obsm=obsm)
-    return adata if copy else None
-
-
 @old_positionals("n_obs", "random_state", "copy")
 def subsample(
     data: AnnData | np.ndarray | spmatrix,
diff --git a/scanpy/tests/test_scaling.py b/scanpy/tests/test_scaling.py
index ba425e4408..b169f7a51e 100644
--- a/scanpy/tests/test_scaling.py
+++ b/scanpy/tests/test_scaling.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 from anndata import AnnData
-from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix, csr_matrix
 
 import scanpy as sc
 
@@ -23,6 +23,12 @@
     [1, 0, 1, 0],
     [0, 0, 0, 0],
 ]  # with gene std 1,0,1,0 and center 0,0,0,0
+X_scaled_original_clipped = [
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+]  # with gene std 1,0,1,0 and center 0,2,1,0
+
 
 X_for_mask = [
     [27, 27, 27, 27],
@@ -51,9 +57,20 @@
     [27, 27, 27, 27],
     [27, 27, 27, 27],
 ]
+X_scaled_for_mask_clipped = [
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+    [-1, 1, 0, 0],
+    [1, 1, 1, 0],
+    [0, 1, 1, 0],
+    [27, 27, 27, 27],
+    [27, 27, 27, 27],
+]
 
 
-@pytest.mark.parametrize("typ", [np.array, csr_matrix], ids=lambda x: x.__name__)
+@pytest.mark.parametrize(
+    "typ", [np.array, csr_matrix, csc_matrix], ids=lambda x: x.__name__
+)
 @pytest.mark.parametrize("dtype", ["float32", "int64"])
 @pytest.mark.parametrize(
     ("mask_obs", "X", "X_centered", "X_scaled"),
@@ -113,3 +130,25 @@ def test_clip(zero_center):
     if zero_center:
         assert adata.X.min() >= -1
         assert adata.X.max() <= 1
+
+
+@pytest.mark.parametrize(
+    ("mask_obs", "X", "X_scaled", "X_clipped"),
+    [
+        (None, X_original, X_scaled_original, X_scaled_original_clipped),
+        (
+            np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool),
+            X_for_mask,
+            X_scaled_for_mask,
+            X_scaled_for_mask_clipped,
+        ),
+    ],
+)
+def test_scale_sparse(*, mask_obs, X, X_scaled, X_clipped):
+    adata0 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata0, mask_obs=mask_obs, zero_center=False)
+    assert np.allclose(csr_matrix(adata0.X).toarray(), X_scaled)
+    # test scaling with clipping at max_value and zero_center == False
+    adata1 = AnnData(csr_matrix(X).astype(np.float32))
+    sc.pp.scale(adata1, zero_center=False, mask_obs=mask_obs, max_value=1)
+    assert np.allclose(csr_matrix(adata1.X).toarray(), X_clipped)
diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py
index c1aab62182..b91f846da1 100644
--- a/scanpy/tools/_rank_genes_groups.py
+++ b/scanpy/tools/_rank_genes_groups.py
@@ -14,7 +14,7 @@
 from .._compat import old_positionals
 from .._utils import check_nonnegative_integers
 from ..get import _check_mask
-from ..preprocessing._simple import _get_mean_var
+from ..preprocessing._utils import _get_mean_var
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
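
A minimal usage sketch of the fast path this patch adds: calling `sc.pp.scale` on a CSR matrix with `zero_center=False` and `mask_obs` dispatches to the new `_scale_sparse_numba` kernel. The toy matrix, mask, and `max_value` below are invented for illustration and are not part of the patch.

```python
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy.sparse import csr_matrix

# Toy data: 4 cells x 3 genes; the values are arbitrary.
X = csr_matrix(
    np.array(
        [
            [1.0, 0.0, 4.0],
            [3.0, 2.0, 0.0],
            [0.0, 6.0, 8.0],
            [5.0, 0.0, 2.0],
        ],
        dtype=np.float32,
    )
)
adata = AnnData(X)

# Scale only the first three cells; the last row is left untouched.
mask = np.array([True, True, True, False])

# zero_center=False keeps the matrix sparse; with mask_obs set, the numba
# kernel divides each stored value by its gene's std and clips at max_value.
sc.pp.scale(adata, zero_center=False, mask_obs=mask, max_value=3)

print(adata.X.toarray())           # the masked-out row keeps its raw values
print(adata.var["std with mask"])  # per-gene std computed from masked rows only
```

Note that with `zero_center=False` only values above `max_value` are clipped; there is no lower bound, which is what `min(clip, data[j] / std[indices[j]])` in `_scale_sparse_numba` implements.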