diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 9e72bd321d..00c7dc2f14 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -1,5 +1,5 @@ trigger: -- master +- main - "*.*.x" variables: @@ -52,21 +52,21 @@ jobs: - script: | python -m pip install --upgrade pip - pip install wheel coverage + pip install wheel pip install .[dev,$(TEST_EXTRA)] displayName: 'Install dependencies' condition: eq(variables['DEPENDENCIES_VERSION'], 'latest') - script: | python -m pip install --pre --upgrade pip - pip install --pre wheel coverage + pip install --pre wheel pip install --pre .[dev,$(TEST_EXTRA)] pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata" displayName: 'Install dependencies release candidates' condition: eq(variables['DEPENDENCIES_VERSION'], 'pre-release') - script: | - python -m pip install pip wheel tomli packaging pytest-cov + python -m pip install pip wheel tomli packaging pip install `python3 ci/scripts/min-deps.py pyproject.toml --extra dev test` pip install --no-deps . displayName: 'Install dependencies minimum version' @@ -81,8 +81,7 @@ jobs: condition: eq(variables['TEST_TYPE'], 'standard') - script: | - coverage run -m pytest - coverage xml + pytest --cov --cov-report=xml --cov-context=test displayName: 'PyTest (coverage)' condition: eq(variables['TEST_TYPE'], 'coverage') diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 38fa7a0d4b..7e94695e47 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -14,7 +14,7 @@ body: required: true - label: I have confirmed this bug exists on the latest version of scanpy. required: true - - label: (optional) I have confirmed this bug exists on the master branch of scanpy. + - label: (optional) I have confirmed this bug exists on the main branch of scanpy. 
required: false - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index a0c4b12e00..1505f196f5 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,4 +1,4 @@ -blank_issues_enabled: false +blank_issues_enabled: true contact_links: - name: Scanpy Community Forum url: https://discourse.scverse.org/ diff --git a/.github/workflows/check-pr.yml b/.github/workflows/check-pr.yml index 21a1f361df..e74c041c8b 100644 --- a/.github/workflows/check-pr.yml +++ b/.github/workflows/check-pr.yml @@ -4,7 +4,6 @@ on: pull_request: branches: - main - - master types: # milestone changes - milestoned diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 18ea71fc98..709cc29b1a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.1 + rev: v0.2.2 hooks: - id: ruff types_or: [python, pyi, jupyter] @@ -26,7 +26,7 @@ repos: - id: check-merge-conflict - id: detect-private-key - id: no-commit-to-branch - args: ["--branch=master", "--branch=main"] + args: ["--branch=main"] ci: autofix_prs: false diff --git a/README.md b/README.md index 219f51a5ca..53c4c35706 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Downloads](https://static.pepy.tech/badge/scanpy)](https://pepy.tech/project/scanpy) [![Conda](https://img.shields.io/conda/dn/conda-forge/scanpy?logo=Anaconda)](https://anaconda.org/conda-forge/scanpy) [![Docs](https://readthedocs.com/projects/icb-scanpy/badge/?version=latest)](https://scanpy.readthedocs.io) -[![Build Status](https://dev.azure.com/scverse/scanpy/_apis/build/status/theislab.scanpy?branchName=master)](https://dev.azure.com/scverse/scanpy/_build) +[![Build Status](https://dev.azure.com/scverse/scanpy/_apis/build/status/theislab.scanpy?branchName=main)](https://dev.azure.com/scverse/scanpy/_build) [![Discourse topics](https://img.shields.io/discourse/posts?color=yellow&logo=discourse&server=https%3A%2F%2Fdiscourse.scverse.org)](https://discourse.scverse.org/) [![Chat](https://img.shields.io/badge/zulip-join_chat-%2367b08f.svg)](https://scverse.zulipchat.com) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org/) diff --git a/docs/api/get.md b/docs/api/get.md index 82f74ed411..039769d0a6 100644 --- a/docs/api/get.md +++ b/docs/api/get.md @@ -19,5 +19,6 @@ useful formats. get.obs_df get.var_df get.rank_genes_groups_df + get.aggregate ``` diff --git a/docs/dev/code.md b/docs/dev/code.md index 61fffb31cb..06c267951b 100644 --- a/docs/dev/code.md +++ b/docs/dev/code.md @@ -20,5 +20,5 @@ We ignore a couple of flake8 checks which are documented in the .flake8 file in To learn how to ignore checks per line please read [flake8 violations](https://flake8.pycqa.org/en/latest/user/violations.html). Additionally, we use Scanpy’s -[EditorConfig](https://github.com/scverse/scanpy/blob/master/.editorconfig), +[EditorConfig](https://github.com/scverse/scanpy/blob/main/.editorconfig), so using an editor/IDE with support for both is helpful. diff --git a/docs/dev/getting-set-up.md b/docs/dev/getting-set-up.md index b7124303f5..750af53e91 100644 --- a/docs/dev/getting-set-up.md +++ b/docs/dev/getting-set-up.md @@ -21,7 +21,7 @@ This is very straight forward if you're using [GitHub's CLI](https://cli.github. 
$ gh repo fork scverse/scanpy --clone --remote ``` -This will fork the repo to your github account, create a clone of the repo on your current machine, add our repository as a remote, and set the `master` development branch to track our repository. +This will fork the repo to your github account, create a clone of the repo on your current machine, add our repository as a remote, and set the `main` development branch to track our repository. To do this manually, first make a fork of the repository by clicking the "fork" button on our main github package. Then, on your machine, run: @@ -32,7 +32,7 @@ git clone https://github.com/{your-username}/scanpy.git cd scanpy # Add our repository as a remote git remote add upstream https://github.com/scverse/scanpy.git -# git branch --set-upstream-to "upstream/master" +# git branch --set-upstream-to "upstream/main" ``` ### `pre-commit` @@ -61,11 +61,11 @@ If you choose not to run the hooks on each commit, you can run them manually wit ### Creating a branch for your feature All development should occur in branches dedicated to the particular work being done. -Additionally, unless you are a maintainer, all changes should be directed at the `master` branch. +Additionally, unless you are a maintainer, all changes should be directed at the `main` branch. You can create a branch with: ```shell -git checkout master # Starting from the master branch +git checkout main # Starting from the main branch git pull # Syncing with the repo git checkout -b {your-branch-name} # Making and changing to the new branch ``` diff --git a/docs/dev/release.md b/docs/dev/release.md index def899ac67..f4bcb528a6 100644 --- a/docs/dev/release.md +++ b/docs/dev/release.md @@ -67,4 +67,4 @@ If you want to replicate the process more exactly, make sure you are careful, and create a version tag before building (make sure you delete it after uploading to TestPyPI!). [hatch-build]: https://hatch.pypa.io/latest/config/build/ -[publish workflow]: https://github.com/scverse/scanpy/tree/master/.github/workflows/publish.yml +[publish workflow]: https://github.com/scverse/scanpy/tree/main/.github/workflows/publish.yml diff --git a/docs/dev/testing.md b/docs/dev/testing.md index e0297cff32..46ab9f129c 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -21,7 +21,7 @@ It can take a while to run the whole test suite. There are a few ways to cut dow ## Writing tests -You can refer to the [existing test suite](https://github.com/scverse/scanpy/tree/master/scanpy/tests) for examples. +You can refer to the [existing test suite](https://github.com/scverse/scanpy/tree/main/scanpy/tests) for examples. If you haven't written tests before, Software Carpentry has an [in-depth guide](https://katyhuff.github.io/2016-07-11-scipy/testing/01-basics.html) on the topic. We highly recommend using [Test Driven Development](https://en.wikipedia.org/wiki/Test-driven_development) when contributing code. 
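For orientation alongside the testing guide above, here is a minimal sketch of the shape a test in `scanpy/tests/` could take; the function name, the synthetic data, and the exact assertion are illustrative assumptions rather than something taken from the existing suite or this patch:

```python
import numpy as np
from anndata import AnnData

import scanpy as sc


def test_normalize_total_sums_to_target():
    # tiny deterministic count matrix so the test needs no downloads
    rng = np.random.default_rng(0)
    counts = (rng.poisson(1.0, size=(20, 10)) + 1).astype(np.float32)
    adata = AnnData(counts)

    sc.pp.normalize_total(adata, target_sum=1.0)

    # after normalization, every cell (row) should sum to the requested total
    np.testing.assert_allclose(adata.X.sum(axis=1), 1.0, rtol=1e-6)
```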
diff --git a/docs/extensions/git_ref.py b/docs/extensions/git_ref.py index 1f57de7a25..dd61f0cce0 100644 --- a/docs/extensions/git_ref.py +++ b/docs/extensions/git_ref.py @@ -32,7 +32,7 @@ def get() -> str | None: try: git_ref = git("rev-parse", "HEAD") except Exception: - git_ref = "master" + git_ref = "main" return git_ref diff --git a/docs/release-notes/1.10.0.md b/docs/release-notes/1.10.0.md index 71ab684386..1b338fd5ed 100644 --- a/docs/release-notes/1.10.0.md +++ b/docs/release-notes/1.10.0.md @@ -1,50 +1,55 @@ -### 1.10.0 {small}`the future` +### 1.10.0rc2 {small}`2024-02-22` + +```{rubric} Bug fixes +``` + +* Fix pytest deprecation warning {pr}`2879` {smaller}`P Angerer` + +### 1.10.0rc1 {small}`2024-02-22` ```{rubric} Features ``` +* {func}`~scanpy.pp.scrublet` and {func}`~scanpy.pp.scrublet_simulate_doublets` were moved from {mod}`scanpy.external.pp` to {mod}`scanpy.pp`. The `scrublet` implementation is now maintained as part of scanpy {pr}`2703` {smaller}`P Angerer` +* {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca` now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer` +* Enhanced dask support for some internal utilities, paving the way for more extensive dask support {pr}`2696` {smaller}`P Angerer` +* {func}`scanpy.pp.highly_variable_genes` supports dask for the default `seurat` and `cell_ranger` flavors {pr}`2809` {smaller}`P Angerer` +* New function {func}`scanpy.get.aggregate` which allows grouped aggregations over your data. Useful for pseudobulking! {pr}`2590` {smaller}`Isaac Virshup` {smaller}`Ilan Gold` {smaller}`Jon Bloom` +* {func}`scanpy.pp.neighbors` now has a `transformer` argument allowing the use of different ANN/KNN libraries {pr}`2536` {smaller}`P Angerer` +* {func}`scanpy.experimental.pp.highly_variable_genes` using `flavor='pearson_residuals'` now uses numba for variance computation and is faster {pr}`2612` {smaller}`S Dicks & P Angerer` +* {func}`scanpy.tl.leiden` now offers `igraph`'s implementation of the leiden algorithm via `flavor` when set to `igraph`. `leidenalg`'s implementation is still the default, but discouraged. {pr}`2815` {smaller}`I Gold` +* {func}`scanpy.pp.highly_variable_genes` has a new flavor `seurat_v3_paper` whose implementation is consistent with the paper description in Stuart et al 2018. {pr}`2792` {smaller}`E Roellin` * {func}`scanpy.datasets.blobs` now accepts a `random_state` argument {pr}`2683` {smaller}`E Roellin` * {func}`scanpy.pp.pca` and {func}`scanpy.pp.regress_out` now accept a layer argument {pr}`2588` {smaller}`S Dicks` * {func}`scanpy.pp.subsample` with `copy=True` can now be called in backed mode {pr}`2624` {smaller}`E Roellin` -* {func}`scanpy.pp.neighbors` now has a `transformer` argument allowing for more flexibility {pr}`2536` {smaller}`P Angerer` -* {func}`scanpy.experimental.pp.highly_variable_genes` using `flavor='pearson_residuals'` - now uses numba for variance computation {pr}`2612` {smaller}`S Dicks & P Angerer` * {func}`scanpy.external.pp.harmony_integrate` now runs with 64 bit floats improving reproducibility {pr}`2655` {smaller}`S Dicks` -* {func}`~scanpy.pp.scrublet` and {func}`~scanpy.pp.scrublet_simulate_doublets` were moved from {mod}`scanpy.external.pp` to {mod}`scanpy.pp`.
- The `scrublet` implementation is now maintained as part of scanpy {pr}`2703` {smaller}`P Angerer` -* Enhanced dask support for some internal utilities, paving the way for more extensive dask support {pr}`2696` {smaller}`P Angerer` -* {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca` - now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer` * {func}`scanpy.tl.rank_genes_groups` no longer warns that its default was changed from t-test_overestim_var to t-test {pr}`2798` {smaller}`L Heumos` -* {func}`scanpy.pp.highly_variable_genes` has new flavor `seurat_v3_paper` that is in its implementation consistent with the paper description in Stuart et al 2018. {pr}`2792` {smaller}`E Roellin` -* {func}`scanpy.pp.highly_variable_genes` supports dask for the default `seurat` and `cell_ranger` flavors {pr}`2809` {smaller}`P Angerer` -* Auto conversion of strings to collections in `scanpy.pp.calculate_qc_metrics` {pr}`2859` {smaller}`N Teyssier` +* `scanpy.pp.calculate_qc_metrics` now allows `qc_vars` to be passed as a string {pr}`2859` {smaller}`N Teyssier` ```{rubric} Docs ``` + +* Re-add search-as-you-type, this time via `readthedocs-sphinx-search` {pr}`2805` {smaller}`P Angerer` * Fixed a lot of broken usage examples {pr}`2605` {smaller}`P Angerer` * Improved harmonization of return field of `sc.pp` and `sc.tl` functions {pr}`2742` {smaller}`E Roellin` -* Re-add search-as-you-type, this time via `readthedocs-sphinx-search` {pr}`2805` {smaller}`P Angerer` * Improved docs for `percent_top` argument of {func}`~scanpy.pp.calculate_qc_metrics` {pr}`2849` {smaller}`I Virshup` ```{rubric} Bug fixes ``` * Updated {func}`~scanpy.read_visium` such that it can read spaceranger 2.0 files {smaller}`L Lehner` -* Fix {func}`~scanpy.pp.normalize_total` {pr}`2466` {smaller}`P Angerer` -* Fix testing package build {pr}`2468` {smaller}`P Angerer` +* Fix {func}`~scanpy.pp.normalize_total` for dask {pr}`2466` {smaller}`P Angerer` * Fix setting `sc.settings.verbosity` in some cases {pr}`2605` {smaller}`P Angerer` * Fix all remaining pandas warnings {pr}`2789` {smaller}`P Angerer` * Fix some annoying plotting warnings around violin plots {pr}`2844` {smaller}`P Angerer` * Scanpy now has a test job which tests against the minimum versions of the dependencies. In the process of implementing this, many bugs associated with using older versions of `pandas`, `anndata`, `numpy`, and `matplotlib` were fixed. {pr}`2816` {smaller}`I Virshup` +* Fix warnings caused by internal usage of `pandas.DataFrame.stack` with `pandas>=2.1` {pr}`2864` {smaller}`I Virshup` ```{rubric} Development ``` * Scanpy is now tested against python 3.12 {pr}`2863` {smaller}`ivirshup` - -```{rubric} Ecosystem -``` +* Fix testing package build {pr}`2468` {smaller}`P Angerer` ```{rubric} Deprecations ``` @@ -52,3 +57,4 @@ * Dropped support for Python 3.8. [More details here](https://numpy.org/neps/nep-0029-deprecation_policy.html). {pr}`2695` {smaller}`P Angerer` * Deprecated specifying large numbers of function parameters by position as opposed to by name/keyword in all public APIs. e.g. prefer `sc.tl.umap(adata, min_dist=0.1, spread=0.8)` over `sc.tl.umap(adata, 0.1, 0.8)` {pr}`2702` {smaller}`P Angerer` +* Dropped support for `umap<0.5` for performance reasons.
{pr}`2870` {smaller}`P Angerer` diff --git a/docs/release-notes/1.11.0.md b/docs/release-notes/1.11.0.md new file mode 100644 index 0000000000..fb6c33a443 --- /dev/null +++ b/docs/release-notes/1.11.0.md @@ -0,0 +1,13 @@ +### 1.11.0 {small}`the future` + +```{rubric} Features +``` + +```{rubric} Docs +``` + +```{rubric} Bug fixes +``` + +```{rubric} Deprecations +``` diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md index 72160d4eac..0dd4e2a020 100644 --- a/docs/release-notes/index.md +++ b/docs/release-notes/index.md @@ -2,6 +2,11 @@ # Release notes +## Version 1.11 + +```{include} /release-notes/1.11.0.md +``` + ## Version 1.10 ```{include} /release-notes/1.10.0.md diff --git a/pyproject.toml b/pyproject.toml index 9046317d07..3acaed64b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,7 @@ dependencies = [ "natsort", "joblib", "numba>=0.56", - "umap-learn>=0.3.10", + "umap-learn>=0.5,!=0.5.0", "pynndescent>=0.5", "packaging>=21.3", "session-info", @@ -87,6 +87,7 @@ test-min = [ "pytest>=7.4.2", "pytest-nunit", "pytest-mock", + "pytest-cov", "profimp", ] test = [ @@ -159,7 +160,6 @@ addopts = [ "--import-mode=importlib", "--strict-markers", "--doctest-modules", - "-pscanpy.testing._pytest", ] testpaths = ["scanpy"] norecursedirs = ["scanpy/tests/_images"] diff --git a/scanpy/__init__.py b/scanpy/__init__.py index d6a2e46770..be2a10f5bf 100644 --- a/scanpy/__init__.py +++ b/scanpy/__init__.py @@ -1,6 +1,8 @@ """Single-Cell Analysis in Python.""" from __future__ import annotations +import sys + try: # See https://github.com/maresb/hatch-vcs-footgun-example from setuptools_scm import get_version @@ -21,6 +23,11 @@ # the actual API # (start with settings as several tools are using it) + +from ._settings import Verbosity, settings + +set_figure_params = settings.set_figure_params + from anndata import ( AnnData, concat, @@ -38,15 +45,10 @@ from . import plotting as pl from . import preprocessing as pp from . import tools as tl -from ._settings import Verbosity, settings from .neighbors import Neighbors from .readwrite import read, read_10x_h5, read_10x_mtx, read_visium, write -set_figure_params = settings.set_figure_params - # has to be done at the end, after everything has been imported -import sys - sys.modules.update({f"{__name__}.{m}": globals()[m] for m in ["tl", "pp", "pl"]}) from ._utils import annotate_doc_types diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index 6d21a11fd5..e182ab3840 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -7,9 +7,11 @@ import importlib.util import inspect +import random import sys import warnings from collections import namedtuple +from contextlib import contextmanager from enum import Enum from functools import partial, singledispatch, wraps from textwrap import dedent @@ -20,10 +22,10 @@ import numpy as np from anndata import AnnData from anndata import __version__ as anndata_version -from numpy import random from numpy.typing import NDArray from packaging import version from scipy import sparse +from sklearn.utils import check_random_state from .. import logging as logg from .._compat import DaskArray @@ -45,16 +47,43 @@ def __repr__(self) -> str: _empty = Empty.token # e.g. 
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html -AnyRandom = Union[int, random.RandomState, None] # maybe in the future random.Generator +# maybe in the future random.Generator +AnyRandom = Union[int, np.random.RandomState, None] -EPS = 1e-15 +class RNGIgraph: + """ + Random number generator for igraph so the global seed is not changed. + See :func:`igraph.set_random_number_generator` for the requirements. + """ + + def __init__(self, random_state: int = 0) -> None: + self._rng = check_random_state(random_state) -def check_versions(): - from .._compat import pkg_version + def __getattr__(self, attr: str): + return getattr(self._rng, "normal" if attr == "gauss" else attr) + + +@contextmanager +def set_igraph_random_state(random_state: int): + try: + import igraph + except ImportError: + raise ImportError( + "Please install igraph: `conda install -c conda-forge igraph` or `pip3 install igraph`." + ) + rng = RNGIgraph(random_state) + try: + igraph.set_random_number_generator(rng) + yield None + finally: + igraph.set_random_number_generator(random) - umap_version = pkg_version("umap-learn") +EPS = 1e-15 + + +def check_versions(): if version.parse(anndata_version) < version.parse("0.6.10"): from .. import __version__ @@ -63,15 +92,6 @@ f"not {anndata_version}.\nRun `pip install anndata -U --no-deps`." ) - if umap_version < version.parse("0.3.0"): - from . import __version__ - - # make this a warning, not an error - # it might be useful for people to still be able to run it - logg.warning( - f"Scanpy {__version__} needs umap " f"version >=0.3.0, not {umap_version}." - ) - def getdoc(c_or_f: Callable | type) -> str | None: if getattr(c_or_f, "__doc__", None) is None: @@ -459,10 +479,10 @@ def moving_average(a: np.ndarray, n: int): return ret[n - 1 :] / n -def get_random_state(seed: AnyRandom) -> random.RandomState: +def get_random_state(seed: AnyRandom) -> np.random.RandomState: if isinstance(seed, np.random.RandomState): return seed - return random.RandomState(seed) + return np.random.RandomState(seed) # -------------------------------------------------------------------------------- @@ -841,3 +861,13 @@ def _choose_graph(adata, obsp, neighbors_key): "to compute a neighborhood graph."
) return neighbors["connectivities"] + + +def _resolve_axis( + axis: Literal["obs", 0, "var", 1], +) -> tuple[Literal[0], Literal["obs"]] | tuple[Literal[1], Literal["var"]]: + if axis in {0, "obs"}: + return (0, "obs") + if axis in {1, "var"}: + return (1, "var") + raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}") diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index 08567cfc6e..56c0d3c130 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -1,7 +1,6 @@ -# Public -# Private from __future__ import annotations +from ._aggregated import aggregate from .get import ( _check_mask, _get_obs_rep, @@ -15,6 +14,7 @@ "_check_mask", "_get_obs_rep", "_set_obs_rep", + "aggregate", "obs_df", "rank_genes_groups_df", "var_df", diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py new file mode 100644 index 0000000000..159f7572e3 --- /dev/null +++ b/scanpy/get/_aggregated.py @@ -0,0 +1,387 @@ +from __future__ import annotations + +from functools import singledispatch +from typing import TYPE_CHECKING, Literal, Union, get_args + +import numpy as np +import pandas as pd +from anndata import AnnData, utils +from scipy import sparse + +from .._utils import _resolve_axis +from .get import _check_mask + +if TYPE_CHECKING: + from collections.abc import Collection, Iterable + + from numpy.typing import NDArray + +Array = Union[np.ndarray, sparse.csc_matrix, sparse.csr_matrix] +AggType = Literal["count_nonzero", "mean", "sum", "var"] + + +class Aggregate: + """\ + Functionality for generic grouping and aggregating. + + There is currently support for count_nonzero, sum, mean, and variance. + + **Implementation** + + Moments are computed using weighted sum aggregation of data by some feature + via multiplication by a sparse coordinate matrix A. + + Runtime is effectively computation of the product `A @ X`, i.e. the count of (non-zero) + entries in X with multiplicity the number of group memberships for that entry. + This is `O(data)` for partitions (each observation belonging to exactly one group), + independent of the number of groups. + + Params + ------ + groupby + :class:`~pandas.Categorical` containing values for grouping by. + data + Data matrix for aggregation. + mask + Mask to be used for aggregation. + """ + + def __init__( + self, + groupby: pd.Categorical, + data: Array, + *, + mask: NDArray[np.bool_] | None = None, + ) -> None: + self.groupby = groupby + self.indicator_matrix = sparse_indicator(groupby, mask=mask) + self.data = data + + groupby: pd.Categorical + indicator_matrix: sparse.coo_matrix + data: Array + + def count_nonzero(self) -> NDArray[np.integer]: + """\ + Count the number of observations in each group. + + Returns + ------- + Array of counts. + """ + # pattern = self.data._with_data(np.broadcast_to(1, len(self.data.data))) + # return self.indicator_matrix @ pattern + return self.indicator_matrix @ (self.data != 0) + + def sum(self) -> Array: + """\ + Compute the sum per feature per group of observations. + + Returns + ------- + Array of sum. + """ + return utils.asarray(self.indicator_matrix @ self.data) + + def mean(self) -> Array: + """\ + Compute the mean per feature per group of observations. + + Returns + ------- + Array of mean. + """ + return ( + utils.asarray(self.indicator_matrix @ self.data) + / np.bincount(self.groupby.codes)[:, None] + ) + + def mean_var(self, dof: int = 1) -> tuple[np.ndarray, np.ndarray]: + """\ + Compute the count, as well as mean and variance per feature, per group of observations. 
+ + The formula `Var(X) = E(X^2) - E(X)^2` suffers loss of precision when the variance is a + very small fraction of the squared mean. In particular, when X is constant, the computed variance may + nonetheless be non-zero. By default, our implementation resets the variance to exactly zero + when the computed variance, relative to the squared mean, nears the limit of precision of the + floating-point significand. + + Params + ------ + dof + Degrees of freedom for variance. + + Returns + ------- + Tuple of the mean and variance arrays. + """ + assert dof >= 0 + + group_counts = np.bincount(self.groupby.codes) + mean_ = self.mean() + # sparse matrices do not support ** for elementwise power. + mean_sq = ( + utils.asarray(self.indicator_matrix @ _power(self.data, 2)) + / group_counts[:, None] + ) + sq_mean = mean_**2 + var_ = mean_sq - sq_mean + # TODO: Why these values exactly? Because they are high relative to the datatype? + # (unchanged from original code: https://github.com/scverse/anndata/pull/564) + precision = 2 << (42 if self.data.dtype == np.float64 else 20) + # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 + var_[precision * var_ < sq_mean] = 0 + if dof != 0: + var_ *= (group_counts / (group_counts - dof))[:, np.newaxis] + return mean_, var_ + + +def _power(X: Array, power: float | int) -> Array: + """\ + Generate elementwise power of a matrix. + + Needed for non-square sparse matrices, which do not support `**`, so the `.power` method is used instead. + + Params + ------ + X + Matrix to be raised to a power. + power + Power (integer or float) to raise the matrix to. + + Returns + ------- + The matrix raised to the given power. + """ + return X**power if isinstance(X, np.ndarray) else X.power(power) + + +@singledispatch +def aggregate( + adata: AnnData, + by: str | Collection[str], + func: AggType | Iterable[AggType], + *, + axis: Literal["obs", 0, "var", 1] | None = None, + mask: NDArray[np.bool_] | str | None = None, + dof: int = 1, + layer: str | None = None, + obsm: str | None = None, + varm: str | None = None, +) -> AnnData: + """\ + Aggregate data matrix based on some categorical grouping. + + This function is useful for pseudobulking as well as plotting. + + Aggregation to perform is specified by `func`, which can be a single metric or a + list of metrics. Each metric is computed over the group and results in a new layer + in the output `AnnData` object. + + If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. + If `func` only has length 1 or is just an `AggType`, then aggregation data is written to `X`. + Otherwise, it is written to `layers` or `xxxm` as appropriate for the dimensions of the aggregation data. + + Params + ------ + adata + :class:`~anndata.AnnData` to be aggregated. + by + Key of the column to be grouped-by. + func + How to aggregate. + axis + Axis on which to find the group-by column. + mask + Boolean mask (or key to column containing mask) to apply along the axis. + dof + Degrees of freedom for variance. Defaults to 1. + layer + If not None, key for aggregation data. + obsm + If not None, key for aggregation data. + varm + If not None, key for aggregation data. + + Returns + ------- + Aggregated :class:`~anndata.AnnData`.
+ + Examples + -------- + + Calculating mean expression and number of nonzero entries per cluster: + + >>> import scanpy as sc, pandas as pd + >>> pbmc = sc.datasets.pbmc3k_processed().raw.to_adata() + >>> pbmc.shape + (2638, 13714) + >>> aggregated = sc.get.aggregate(pbmc, by="louvain", func=["mean", "count_nonzero"]) + >>> aggregated + AnnData object with n_obs × n_vars = 8 × 13714 + obs: 'louvain' + var: 'n_cells' + layers: 'mean', 'count_nonzero' + + We can group over multiple columns: + + >>> pbmc.obs["percent_mito_binned"] = pd.cut(pbmc.obs["percent_mito"], bins=5) + >>> sc.get.aggregate(pbmc, by=["louvain", "percent_mito_binned"], func=["mean", "count_nonzero"]) + AnnData object with n_obs × n_vars = 40 × 13714 + obs: 'louvain', 'percent_mito_binned' + var: 'n_cells' + layers: 'mean', 'count_nonzero' + + Note that this filters out any combination of groups that wasn't present in the original data. + """ + if axis is None: + axis = 1 if varm else 0 + axis, axis_name = _resolve_axis(axis) + if mask is not None: + mask = _check_mask(adata, mask, axis_name) + data = adata.X + if sum(p is not None for p in [varm, obsm, layer]) > 1: + raise TypeError("Please only provide one (or none) of varm, obsm, or layer") + + if varm is not None: + if axis != 1: + raise ValueError("varm can only be used when axis is 1") + data = adata.varm[varm] + elif obsm is not None: + if axis != 0: + raise ValueError("obsm can only be used when axis is 0") + data = adata.obsm[obsm] + elif layer is not None: + data = adata.layers[layer] + if axis == 1: + data = data.T + elif axis == 1: + # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed + data = data.T + + dim_df = getattr(adata, axis_name) + categorical, new_label_df = _combine_categories(dim_df, by) + # Actual computation + layers = aggregate( + data, + by=categorical, + func=func, + mask=mask, + dof=dof, + ) + result = AnnData( + layers=layers, + obs=new_label_df, + var=getattr(adata, "var" if axis == 0 else "obs"), + ) + + if axis == 1: + return result.T + else: + return result + + +@aggregate.register(np.ndarray) +@aggregate.register(sparse.spmatrix) +def aggregate_array( + data, + by: pd.Categorical, + func: AggType | Iterable[AggType], + *, + mask: NDArray[np.bool_] | None = None, + dof: int = 1, +) -> dict[AggType, np.ndarray]: + groupby = Aggregate(groupby=by, data=data, mask=mask) + result = {} + + funcs = set([func] if isinstance(func, str) else func) + if unknown := funcs - set(get_args(AggType)): + raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") + + if "sum" in funcs: # sum is calculated separately from the rest + agg = groupby.sum() + result["sum"] = agg + # here and below for count, if var is present, these can be calculated alongside var + if "mean" in funcs and "var" not in funcs: + agg = groupby.mean() + result["mean"] = agg + if "count_nonzero" in funcs: + result["count_nonzero"] = groupby.count_nonzero() + if "var" in funcs: + mean_, var_ = groupby.mean_var(dof) + result["var"] = var_ + if "mean" in funcs: + result["mean"] = mean_ + + return result + + +def _combine_categories( + label_df: pd.DataFrame, cols: Collection[str] | str +) -> tuple[pd.Categorical, pd.DataFrame]: + """ + Returns both the result categories and a dataframe labelling each row + """ + from itertools import product + + if isinstance(cols, str): + cols = [cols] + + df = pd.DataFrame( + {c: pd.Categorical(label_df[c]).remove_unused_categories() for c in cols}, + ) + n_categories = [len(df[c].cat.categories) for c in
cols] + + # It's like np.concatenate([x for x in product(*[range(n) for n in n_categories])]) + code_combinations = np.indices(n_categories).reshape(len(n_categories), -1) + result_categories = pd.Index( + ["_".join(map(str, x)) for x in product(*[df[c].cat.categories for c in cols])] + ) + + # Dataframe with unique combination of categories for each row + new_label_df = pd.DataFrame( + { + c: pd.Categorical.from_codes(code_combinations[i], df[c].cat.categories) + for i, c in enumerate(cols) + }, + index=result_categories, + ) + + # Calculating result codes + factors = np.ones(len(cols) + 1, dtype=np.int32) # First factor needs to be 1 + np.cumsum(n_categories[::-1], out=factors[1:]) + factors = factors[:-1][::-1] + + code_array = np.zeros((len(cols), df.shape[0]), dtype=np.int32) + for i, c in enumerate(cols): + code_array[i] = df[c].cat.codes + code_array *= factors[:, None] + + result_categorical = pd.Categorical.from_codes( + code_array.sum(axis=0), categories=result_categories + ) + + # Filter unused categories + result_categorical = result_categorical.remove_unused_categories() + new_label_df = new_label_df.loc[result_categorical.categories] + + return result_categorical, new_label_df + + +def sparse_indicator( + categorical: pd.Categorical, + *, + mask: NDArray[np.bool_] | None = None, + weight: NDArray[np.floating] | None = None, +) -> sparse.coo_matrix: + if mask is not None and weight is None: + weight = mask.astype(np.float32) + elif mask is not None and weight is not None: + weight = mask * weight + elif mask is None and weight is None: + weight = np.broadcast_to(1.0, len(categorical)) + A = sparse.coo_matrix( + (weight, (categorical.codes, np.arange(len(categorical)))), + shape=(len(categorical.categories), len(categorical)), + ) + return A diff --git a/scanpy/get/get.py b/scanpy/get/get.py index a9ca372dc0..3080dfb2bb 100644 --- a/scanpy/get/get.py +++ b/scanpy/get/get.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd from anndata import AnnData +from packaging.version import Version from scipy.sparse import spmatrix if TYPE_CHECKING: @@ -23,7 +24,7 @@ # TODO: implement diffxpy method, make singledispatch def rank_genes_groups_df( adata: AnnData, - group: str | Iterable[str], + group: str | Iterable[str] | None, *, key: str = "rank_genes_groups", pval_cutoff: float | None = None, @@ -74,7 +75,10 @@ def rank_genes_groups_df( d = [pd.DataFrame(adata.uns[key][c])[group] for c in colnames] d = pd.concat(d, axis=1, names=[None, "group"], keys=colnames) - d = d.stack(level=1).reset_index() + if Version(pd.__version__) >= Version("2.1"): + d = d.stack(level=1, future_stack=True).reset_index() + else: + d = d.stack(level=1).reset_index() d["group"] = pd.Categorical(d["group"], categories=group) d = d.sort_values(["group", "level_0"]).drop(columns="level_0") diff --git a/scanpy/neighbors/_connectivity.py b/scanpy/neighbors/_connectivity.py index f6788d33d2..adbde723f7 100644 --- a/scanpy/neighbors/_connectivity.py +++ b/scanpy/neighbors/_connectivity.py @@ -123,7 +123,7 @@ def umap( from umap.umap_ import fuzzy_simplicial_set X = coo_matrix(([], ([], [])), shape=(n_obs, 1)) - connectivities = fuzzy_simplicial_set( + connectivities, _sigmas, _rhos = fuzzy_simplicial_set( X, n_neighbors, None, @@ -134,8 +134,4 @@ def umap( local_connectivity=local_connectivity, ) - if isinstance(connectivities, tuple): - # In umap-learn 0.4, this returns (result, sigmas, rhos) - connectivities = connectivities[0] - return connectivities.tocsr() diff --git a/scanpy/plotting/_stacked_violin.py 
b/scanpy/plotting/_stacked_violin.py index 6b9abfd49c..21a9d9705b 100644 --- a/scanpy/plotting/_stacked_violin.py +++ b/scanpy/plotting/_stacked_violin.py @@ -7,6 +7,7 @@ import pandas as pd from matplotlib import pyplot as plt from matplotlib.colors import Normalize, is_color_like +from packaging.version import Version from .. import logging as logg from .._compat import old_positionals @@ -468,9 +469,13 @@ def _make_rows_of_violinplots( # the expression value # This format is convenient to aggregate per gene or per category # while making the violin plots. + if Version(pd.__version__) >= Version("2.1"): + stack_kwargs = {"future_stack": True} + else: + stack_kwargs = {"dropna": False} df = ( - pd.DataFrame(_matrix.stack(dropna=False)) + pd.DataFrame(_matrix.stack(**stack_kwargs)) .reset_index() .rename( columns={ diff --git a/scanpy/preprocessing/_docs.py b/scanpy/preprocessing/_docs.py index b46a5d48ff..3c13e84afb 100644 --- a/scanpy/preprocessing/_docs.py +++ b/scanpy/preprocessing/_docs.py @@ -26,7 +26,7 @@ By default uses them if they have been determined beforehand. .. deprecated:: 1.10.0 - Use `mask` instead + Use `mask_var` instead """ doc_obs_qc_args = """\ diff --git a/scanpy/testing/_doctests.py b/scanpy/testing/_doctests.py index 9be5b3f227..6a339ab7f8 100644 --- a/scanpy/testing/_doctests.py +++ b/scanpy/testing/_doctests.py @@ -9,22 +9,8 @@ def doctest_needs(mod: str) -> Callable[[F], F]: """Mark function with doctest dependency.""" - try: - from ._pytest.marks import needs - except ImportError: - mark = None - else: - try: - mark = needs[mod] - except KeyError: - raise KeyError( - f"Unknown dependency {mod}. If it isn’t a typo, " - "please add it to `needs` enum in `scanpy.testing._pytests.marks`." - ) from None - def decorator(func: F) -> F: - if mark is not None: - func._doctest_needs = mark + func._doctest_needs = mod return func return decorator diff --git a/scanpy/testing/_pytest/__init__.py b/scanpy/testing/_pytest/__init__.py index ad42b8ec74..e864d31d3d 100644 --- a/scanpy/testing/_pytest/__init__.py +++ b/scanpy/testing/_pytest/__init__.py @@ -10,12 +10,11 @@ from ..._utils import _import_name from .fixtures import * # noqa: F403 +from .marks import needs if TYPE_CHECKING: from collections.abc import Generator, Iterable - from .marks import needs - # Defining it here because it’s autouse. 
@pytest.fixture(autouse=True) @@ -113,9 +112,9 @@ def _modify_doctests(request: pytest.FixtureRequest) -> None: request.getfixturevalue("_doctest_env") func = _import_name(request.node.name) - needs_marker: needs | None - if needs_marker := getattr(func, "_doctest_needs", None): - assert needs_marker.mark.name == "skipif" + needs_mod: str | None + if needs_mod := getattr(func, "_doctest_needs", None): + needs_marker = needs[needs_mod] if needs_marker.mark.args[0]: pytest.skip(reason=needs_marker.mark.kwargs["reason"]) skip_reason: str | None diff --git a/scanpy/testing/_pytest/marks.py b/scanpy/testing/_pytest/marks.py index e6fa85f936..dd6fda5db0 100644 --- a/scanpy/testing/_pytest/marks.py +++ b/scanpy/testing/_pytest/marks.py @@ -57,4 +57,4 @@ def __init__(self, mod: str) -> None: if self._name_.casefold() != mod.casefold().replace("-", "_"): reason = f"{reason} (`pip install {mod}`)" dec = pytest.mark.skipif(not find_spec(self._name_), reason=reason) - super().__init__(dec.mark) + super().__init__(dec.mark, _ispytest=True) diff --git a/scanpy/tests/_images/heatmap_var_as_dict/expected.png b/scanpy/tests/_images/heatmap_var_as_dict/expected.png index 9406010120..aed510396f 100644 Binary files a/scanpy/tests/_images/heatmap_var_as_dict/expected.png and b/scanpy/tests/_images/heatmap_var_as_dict/expected.png differ diff --git a/scanpy/tests/conftest.py b/scanpy/tests/conftest.py index a3dcf69d75..5c96484f4e 100644 --- a/scanpy/tests/conftest.py +++ b/scanpy/tests/conftest.py @@ -7,6 +7,8 @@ import pytest +pytest_plugins = ["scanpy.testing._pytest"] + # just import for the IMPORTED check import scanpy as _sc # noqa: F401 diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png index d14bbab718..093cc370aa 100644 Binary files a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png and b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png differ diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png index a7d4ea57b5..f4ca8d06f0 100644 Binary files a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png and b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png differ diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png index e33a4e2d44..89d455fa3c 100644 Binary files a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png and b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png differ diff --git a/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png index 41bae501b5..2be9e84fd1 100644 Binary files a/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png and b/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png differ diff --git a/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png index 4f2bf2fbf2..a9c29f6c24 100644 Binary files a/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png and b/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png differ diff --git a/scanpy/tests/notebooks/test_pbmc3k.py b/scanpy/tests/notebooks/test_pbmc3k.py index 5e2c71e617..ad883d1403 100644 --- a/scanpy/tests/notebooks/test_pbmc3k.py +++ 
b/scanpy/tests/notebooks/test_pbmc3k.py @@ -28,11 +28,10 @@ @needs.leidenalg def test_pbmc3k(image_comparer): + # ensure violin plots and other non-deterministic plots have deterministic behavior + np.random.seed(0) save_and_compare_images = partial(image_comparer, ROOT, tol=20) - - adata = sc.read( - "./data/pbmc3k_raw.h5ad", backup_url="https://falexwolf.de/data/pbmc3k_raw.h5ad" - ) + adata = sc.datasets.pbmc3k() # Preprocessing @@ -105,13 +104,48 @@ def test_pbmc3k(image_comparer): # Clustering the graph - sc.tl.leiden(adata, resolution=0.9) - # sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'], show=False) - # save_and_compare_images('umap_2') + sc.tl.leiden( + adata, + resolution=0.9, + random_state=0, + directed=False, + n_iterations=2, + flavor="igraph", + ) + + # sc.pl.umap(adata, color=["leiden", "CST3", "NKG7"], show=False) + # save_and_compare_images("umap_2") sc.pl.scatter(adata, "CST3", "NKG7", color="leiden", show=False) save_and_compare_images("scatter_3") # Finding marker genes + # Due to inconsistency between our test runner and local runs, these clusters need to + # be pre-annotated as the numbers for each cluster are not consistent. + marker_genes = [ + "RP11-18H21.1", + "GZMK", + "CD79A", + "FCGR3A", + "GNLY", + "S100A8", + "FCER1A", + "PPBP", + ] + new_labels = ["0", "1", "2", "3", "4", "5", "6", "7"] + data_df = adata[:, marker_genes].to_df() + data_df["leiden"] = adata.obs["leiden"] + max_idxs = data_df.groupby("leiden", observed=True).mean().idxmax() + leiden_relabel = {} + for marker_gene, new_label in zip(marker_genes, new_labels): + leiden_relabel[max_idxs[marker_gene]] = new_label + adata.obs["leiden_old"] = adata.obs["leiden"].copy() + adata.rename_categories( + "leiden", [leiden_relabel[key] for key in sorted(leiden_relabel.keys())] + ) + # ensure that the column can be sorted for consistent plotting since it is by default unordered + adata.obs["leiden"] = adata.obs["leiden"].cat.reorder_categories( + list(map(str, range(len(adata.obs["leiden"].cat.categories)))), ordered=True + ) sc.tl.rank_genes_groups(adata, "leiden") sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False) @@ -129,18 +163,13 @@ def test_pbmc3k(image_comparer): # sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8) # save_and_compare_images('rank_genes_groups_4') - if adata[adata.obs["leiden"] == "4", "CST3"].X.mean() < 1: - ( # switch clusters - adata.obs["leiden"][adata.obs["leiden"] == "4"], - adata.obs["leiden"][adata.obs["leiden"] == "5"], - ) = ("5", "4") new_cluster_names = [ "CD4 T cells", - "CD14+ Monocytes", - "B cells", "CD8 T cells", + "B cells", "NK cells", "FCGR3A+ Monocytes", + "CD14+ Monocytes", "Dendritic cells", "Megakaryocytes", ] @@ -148,7 +177,6 @@ def test_pbmc3k(image_comparer): # sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, show=False) # save_and_compare_images('umap_3') - sc.pl.violin( adata, ["CST3", "NKG7", "PPBP"], groupby="leiden", rotation=90, show=False ) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py new file mode 100644 index 0000000000..0b3e14b163 --- /dev/null +++ b/scanpy/tests/test_aggregated.py @@ -0,0 +1,380 @@ +from __future__ import annotations + +import anndata as ad +import numpy as np +import pandas as pd +import pytest +from packaging.version import Version +from scipy.sparse import csr_matrix + +import scanpy as sc +from scanpy._utils import _resolve_axis +from scanpy.testing._helpers import assert_equal +from scanpy.testing._helpers.data import pbmc3k_processed +from
scanpy.testing._pytest.params import ARRAY_TYPES_MEM + + +@pytest.fixture +def df_base(): + ax_base = ["A", "B"] + return pd.DataFrame(index=ax_base) + + +@pytest.fixture +def df_groupby(): + ax_groupby = [ + *["v0", "v1", "v2"], + *["w0", "w1"], + *["a1", "a2", "a3"], + *["b1", "b2"], + *["c1", "c2"], + "d0", + ] + + df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) + df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) + df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( + {"v": "v", "w": "v", "a": "a", "b": "a", "c": "a", "d": "a"} + ) + df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) + df_groupby["weight"] = 2.0 + return df_groupby + + +@pytest.fixture +def X(): + data = [ + *[[0, -2], [1, 13], [2, 1]], # v + *[[3, 12], [4, 2]], # w + *[[5, 11], [6, 3], [7, 10]], # a + *[[8, 4], [9, 9]], # b + *[[10, 5], [11, 8]], # c + [12, 6], # d + ] + return np.array(data, dtype=np.float32) + + +def gen_adata(data_key, dim, df_base, df_groupby, X): + if (data_key == "varm" and dim == "obs") or (data_key == "obsm" and dim == "var"): + pytest.skip("invalid parameter combination") + + obs_df, var_df = (df_groupby, df_base) if dim == "obs" else (df_base, df_groupby) + data = X.T if dim == "var" and data_key != "varm" else X + if data_key != "X": + data_dict_sparse = {data_key: {"test": csr_matrix(data)}} + data_dict_dense = {data_key: {"test": data}} + else: + data_dict_sparse = {data_key: csr_matrix(data)} + data_dict_dense = {data_key: data} + + adata_sparse = ad.AnnData(obs=obs_df, var=var_df, **data_dict_sparse) + adata_dense = ad.AnnData(obs=obs_df, var=var_df, **data_dict_dense) + return adata_sparse, adata_dense + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_mask(axis): + blobs = sc.datasets.blobs() + mask = blobs.obs["blobs"] == 0 + blobs.obs["mask_col"] = mask + if axis == 1: + blobs = blobs.T + by_name = sc.get.aggregate(blobs, "blobs", "sum", axis=axis, mask="mask_col") + by_value = sc.get.aggregate(blobs, "blobs", "sum", axis=axis, mask=mask) + + assert_equal(by_name, by_value) + + assert np.all(by_name["0"].layers["sum"] == 0) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +@pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) +def test_aggregate_vs_pandas(metric, array_type): + adata = pbmc3k_processed().raw.to_adata() + adata = adata[ + adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 + ].copy() + adata.X = array_type(adata.X) + adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=5) + result = sc.get.aggregate(adata, ["louvain", "percent_mito_binned"], metric) + + if metric == "count_nonzero": + expected = ( + (adata.to_df() != 0) + .astype(np.float64) + .join(adata.obs[["louvain", "percent_mito_binned"]]) + .groupby(["louvain", "percent_mito_binned"], observed=True) + .agg("sum") + ) + else: + expected = ( + adata.to_df() + .astype(np.float64) + .join(adata.obs[["louvain", "percent_mito_binned"]]) + .groupby(["louvain", "percent_mito_binned"], observed=True) + .agg(metric) + ) + expected.index = expected.index.to_frame().apply( + lambda x: "_".join(map(str, x)), axis=1 + ) + expected.index.name = None + expected.columns.name = None + + result_df = result.to_df(layer=metric) + result_df.index.name = None + result_df.columns.name = None + + if Version(pd.__version__) < Version("2"): + # Order of results returned by groupby changed in pandas 2 + assert expected.shape == result_df.shape + assert 
expected.index.isin(result_df.index).all() + + expected = expected.loc[result_df.index] + + pd.testing.assert_frame_equal(result_df, expected, check_dtype=False, atol=1e-5) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +@pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) +def test_aggregate_axis(array_type, metric): + adata = pbmc3k_processed().raw.to_adata() + adata = adata[ + adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 + ].copy() + adata.X = array_type(adata.X) + expected = sc.get.aggregate(adata, ["louvain"], metric) + actual = sc.get.aggregate(adata.T, ["louvain"], metric, axis=1).T + + assert_equal(expected, actual) + + +def test_aggregate_entry(): + args = ("blobs", ["mean", "var", "count_nonzero"]) + + adata = sc.datasets.blobs() + X_result = sc.get.aggregate(adata, *args) + # layer adata + layer_adata = ad.AnnData( + obs=adata.obs, + var=adata.var, + layers={"test": adata.X.copy()}, + ) + layer_result = sc.get.aggregate(layer_adata, *args, layer="test") + obsm_adata = ad.AnnData( + obs=adata.obs, + var=adata.var, + obsm={"test": adata.X.copy()}, + ) + obsm_result = sc.get.aggregate(obsm_adata, *args, obsm="test") + varm_adata = ad.AnnData( + obs=adata.var, + var=adata.obs, + varm={"test": adata.X.copy()}, + ) + varm_result = sc.get.aggregate(varm_adata, *args, varm="test") + + X_result_min = X_result.copy() + del X_result_min.var + X_result_min.var_names = [str(x) for x in np.arange(X_result_min.n_vars)] + + assert_equal(X_result, layer_result) + assert_equal(X_result_min, obsm_result) + assert_equal(X_result.layers, obsm_result.layers) + assert_equal(X_result.layers, varm_result.T.layers) + + +def test_aggregate_incorrect_dim(): + adata = pbmc3k_processed().raw.to_adata() + + with pytest.raises(ValueError, match="was 'foo'"): + sc.get.aggregate(adata, ["louvain"], "sum", axis="foo") + + +@pytest.mark.parametrize("axis_name", ["obs", "var"]) +def test_aggregate_axis_specification(axis_name): + axis, axis_name = _resolve_axis(axis_name) + by = "blobs" if axis == 0 else "labels" + + adata = sc.datasets.blobs() + adata.var["labels"] = np.tile(["a", "b"], adata.shape[1])[: adata.shape[1]] + + agg_index = sc.get.aggregate(adata, by=by, func="mean", axis=axis) + agg_name = sc.get.aggregate(adata, by=by, func="mean", axis=axis_name) + + np.testing.assert_equal(agg_index.layers["mean"], agg_name.layers["mean"]) + + if axis_name == "obs": + agg_unspecified = sc.get.aggregate(adata, by=by, func="mean") + np.testing.assert_equal(agg_name.layers["mean"], agg_unspecified.layers["mean"]) + + +@pytest.mark.parametrize( + ("matrix", "df", "keys", "metrics", "expected"), + [ + pytest.param( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + ["a", "b"], + ["count_nonzero"], # , "sum", "mean"], + ad.AnnData( + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "count_nonzero": np.array( + [[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]] + ), + # "sum": np.array([[2, 0], [0, 2]]), + # "mean": np.array([[1, 0], [0, 1]]), + }, + ), + id="count_nonzero", + ), + pytest.param( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + 
["a", "b"], + ["sum", "mean", "count_nonzero"], + ad.AnnData( + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "sum": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), + "count_nonzero": np.array( + [[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]] + ), + }, + ), + id="sum-mean-count_nonzero", + ), + pytest.param( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + ["a", "b"], + ["mean"], + ad.AnnData( + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), + }, + ), + id="mean", + ), + ], +) +def test_aggregate_examples(matrix, df, keys, metrics, expected): + adata = ad.AnnData( + X=matrix, + obs=df, + var=pd.DataFrame(index=[f"gene_{i}" for i in range(matrix.shape[1])]), + ) + result = sc.get.aggregate(adata, by=keys, func=metrics) + + print(result) + print(expected) + + assert_equal(expected, result) + + +@pytest.mark.parametrize( + ("label_cols", "cols", "expected"), + [ + pytest.param( + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + ), + ["a", "b"], + pd.Categorical(["a_d", "b_d", "c_f"]), + id="two_of_two", + ), + pytest.param( + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), + ), + ["a", "b", "c"], + pd.Categorical(["a_d_g", "b_d_h", "c_f_h"]), + id="three_of_three", + ), + pytest.param( + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), + ), + ["a", "c"], + pd.Categorical(["a_g", "b_h", "c_h"]), + id="two_of_three-1", + ), + pytest.param( + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), + ), + ["b", "c"], + pd.Categorical(["d_g", "d_h", "f_h"]), + id="two_of_three-2", + ), + ], +) +def test_combine_categories(label_cols, cols, expected): + from scanpy.get._aggregated import _combine_categories + + label_df = pd.DataFrame(label_cols) + result, result_label_df = _combine_categories(label_df, cols) + + assert isinstance(result, pd.Categorical) + + pd.testing.assert_extension_array_equal(result, expected) + + pd.testing.assert_index_equal( + pd.Index(result), result_label_df.index.astype("category") + ) + + reconstructed_df = pd.DataFrame( + [x.split("_") for x in result], columns=cols, index=result.astype(str) + ).astype("category") + pd.testing.assert_frame_equal(reconstructed_df, result_label_df) diff --git a/scanpy/tests/test_clustering.py b/scanpy/tests/test_clustering.py index f42a4a4b2a..d2005d9659 100644 --- a/scanpy/tests/test_clustering.py +++ b/scanpy/tests/test_clustering.py @@ -1,6 +1,7 @@ from __future__ import annotations import pytest +from sklearn.metrics.cluster import normalized_mutual_info_score import scanpy as sc from scanpy.testing._helpers.data import pbmc68k_reduced @@ -12,11 +13,136 @@ def adata_neighbors(): return pbmc68k_reduced() +FLAVORS = [ + pytest.param("igraph", marks=needs.igraph), + pytest.param("leidenalg", marks=needs.leidenalg), +] + + +@needs.leidenalg +@needs.igraph 
+@pytest.mark.parametrize("flavor", FLAVORS) +@pytest.mark.parametrize("resolution", [1, 2]) +@pytest.mark.parametrize("n_iterations", [-1, 3]) +def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations): + sc.tl.leiden( + adata_neighbors, + flavor=flavor, + resolution=resolution, + n_iterations=n_iterations, + directed=(flavor == "leidenalg"), + ) + assert adata_neighbors.uns["leiden"]["params"]["resolution"] == resolution + assert adata_neighbors.uns["leiden"]["params"]["n_iterations"] == n_iterations + + @needs.leidenalg -def test_leiden_basic(adata_neighbors): - sc.tl.leiden(adata_neighbors) +@needs.igraph +@pytest.mark.parametrize("flavor", FLAVORS) +def test_leiden_random_state(adata_neighbors, flavor): + is_leiden_alg = flavor == "leidenalg" + n_iterations = 2 if is_leiden_alg else -1 + adata_1 = sc.tl.leiden( + adata_neighbors, + flavor=flavor, + random_state=1, + copy=True, + directed=is_leiden_alg, + n_iterations=n_iterations, + ) + adata_1_again = sc.tl.leiden( + adata_neighbors, + flavor=flavor, + random_state=1, + copy=True, + directed=is_leiden_alg, + n_iterations=n_iterations, + ) + adata_2 = sc.tl.leiden( + adata_neighbors, + flavor=flavor, + random_state=2, + copy=True, + directed=is_leiden_alg, + n_iterations=n_iterations, + ) + assert (adata_1.obs["leiden"] == adata_1_again.obs["leiden"]).all() + assert (adata_2.obs["leiden"] != adata_1_again.obs["leiden"]).any() + + +@needs.igraph +def test_leiden_igraph_directed(adata_neighbors): + with pytest.raises(ValueError): + sc.tl.leiden(adata_neighbors, flavor="igraph", directed=True) + + +@needs.igraph +def test_leiden_wrong_flavor(adata_neighbors): + with pytest.raises(ValueError): + sc.tl.leiden(adata_neighbors, flavor="foo") + + +@needs.igraph +@needs.leidenalg +def test_leiden_igraph_partition_type(adata_neighbors): + import leidenalg + + with pytest.raises(ValueError): + sc.tl.leiden( + adata_neighbors, + flavor="igraph", + partition_type=leidenalg.RBConfigurationVertexPartition, + ) + + +@needs.leidenalg +@needs.igraph +def test_leiden_equal_defaults_same_args(adata_neighbors): + """Ensure the two implementations are the same for the same args.""" + leiden_alg_clustered = sc.tl.leiden( + adata_neighbors, flavor="leidenalg", copy=True, n_iterations=2 + ) + igraph_clustered = sc.tl.leiden( + adata_neighbors, flavor="igraph", copy=True, directed=False, n_iterations=2 + ) + assert ( + normalized_mutual_info_score( + leiden_alg_clustered.obs["leiden"], igraph_clustered.obs["leiden"] + ) + > 0.9 + ) + + +@needs.leidenalg +@needs.igraph +def test_leiden_equal_defaults(adata_neighbors): + """Ensure that the old leidenalg defaults are close enough to the current default outputs.""" + leiden_alg_clustered = sc.tl.leiden( + adata_neighbors, flavor="leidenalg", directed=True, copy=True + ) + igraph_clustered = sc.tl.leiden( + adata_neighbors, copy=True, n_iterations=2, directed=False + ) + assert ( + normalized_mutual_info_score( + leiden_alg_clustered.obs["leiden"], igraph_clustered.obs["leiden"] + ) + > 0.9 + ) + + +@needs.igraph +def test_leiden_objective_function(adata_neighbors): + """Ensure that popping this as a `clustering_kwargs` and using it does not error out.""" + sc.tl.leiden( + adata_neighbors, + objective_function="modularity", + flavor="igraph", + directed=False, + ) +@needs.igraph @pytest.mark.parametrize( "clustering,key", [ @@ -52,6 +178,7 @@ def test_clustering_subset(adata_neighbors, clustering, key): @needs.louvain +@needs.igraph def test_louvain_basic(adata_neighbors): 
sc.tl.louvain(adata_neighbors) sc.tl.louvain(adata_neighbors, use_weights=True) @@ -60,6 +187,7 @@ def test_louvain_basic(adata_neighbors): @needs.louvain +@needs.igraph def test_partition_type(adata_neighbors): import louvain diff --git a/scanpy/tests/test_pca.py b/scanpy/tests/test_pca.py index a18a313063..3150808ee1 100644 --- a/scanpy/tests/test_pca.py +++ b/scanpy/tests/test_pca.py @@ -165,7 +165,7 @@ def test_pca_transform(array_type): with warnings.catch_warnings(record=True) as record: sc.pp.pca(adata, n_comps=4, zero_center=True, dtype="float64") - assert len(record) == 0 + assert len(record) == 0, record assert np.linalg.norm(A_pca_abs[:, :4] - np.abs(adata.obsm["X_pca"])) < 2e-05 @@ -382,7 +382,7 @@ def test_mask_order_warning(request): UserWarning, match="When using a mask parameter with anndata<0.9 on a dense array", ): - sc.pp.pca(adata, mask=mask) + sc.pp.pca(adata, mask_var=mask) def test_mask_defaults(array_type, float_dtype): diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py index 0a03fdd8ba..5482c0d72f 100644 --- a/scanpy/tests/test_plotting.py +++ b/scanpy/tests/test_plotting.py @@ -103,7 +103,14 @@ def test_heatmap(image_comparer): # test var_names as dict pbmc = pbmc68k_reduced() - sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5) + sc.tl.leiden( + pbmc, + key_added="clusters", + resolution=0.5, + flavor="igraph", + n_iterations=2, + directed=False, + ) # call umap to trigger colors for the clusters sc.pl.umap(pbmc, color="clusters") marker_genes_dict = { diff --git a/scanpy/tests/test_score_genes.py b/scanpy/tests/test_score_genes.py index cb44e46dc9..23bd934e78 100644 --- a/scanpy/tests/test_score_genes.py +++ b/scanpy/tests/test_score_genes.py @@ -66,8 +66,8 @@ def test_score_with_reference(): sc.tl.score_genes(adata, gene_list=adata.var_names[:100], score_name="Test") with Path(HERE, "score_genes_reference_paul2015.pkl").open("rb") as file: reference = pickle.load(file) - # np.testing.assert_allclose(reference, adata.obs.Test.values) - np.testing.assert_array_equal(reference, adata.obs.Test.values) + # np.testing.assert_allclose(reference, adata.obs["Test"].to_numpy()) + np.testing.assert_array_equal(reference, adata.obs["Test"].to_numpy()) def test_add_score(): diff --git a/scanpy/tools/_ingest.py b/scanpy/tools/_ingest.py index 4c8528180b..fb36b7a913 100644 --- a/scanpy/tools/_ingest.py +++ b/scanpy/tools/_ingest.py @@ -222,12 +222,9 @@ class Ingest: """ def _init_umap(self, adata): - import umap as u + from umap import UMAP - if not self._use_pynndescent: - u.umap_._HAVE_PYNNDESCENT = False - - self._umap = u.UMAP( + self._umap = UMAP( metric=self._metric, random_state=adata.uns["umap"]["params"].get("random_state", 0), ) @@ -246,79 +243,16 @@ def _init_umap(self, adata): self._umap._n_neighbors = self._n_neighbors self._umap.n_neighbors = self._n_neighbors - if not self._use_pynndescent: - if self._random_init is not None or self._tree_init is not None: - self._umap._random_init = self._random_init - self._umap._tree_init = self._tree_init - self._umap._search = self._search - - if self._dist_func is not None: - self._umap._input_distance_func = self._dist_func - - self._umap._rp_forest = self._rp_forest - - self._umap._search_graph = self._search_graph - else: - self._umap._knn_search_index = self._nnd_idx + self._umap._knn_search_index = self._nnd_idx self._umap._a = adata.uns["umap"]["params"]["a"] self._umap._b = adata.uns["umap"]["params"]["b"] self._umap._input_hash = None - def _init_dist_search(self, dist_args): - from 
functools import partial - - from umap.distances import named_distances - from umap.nndescent import initialise_search - - self._random_init = None - self._tree_init = None - - self._initialise_search = None - self._search = None - - self._dist_func = None - - dist_func = named_distances[self._metric] - - if pkg_version("umap-learn") < version.parse("0.4.0"): - from umap.nndescent import ( - make_initialisations, - make_initialized_nnd_search, - ) - - self._random_init, self._tree_init = make_initialisations( - dist_func, dist_args - ) - _initialise_search = partial( - initialise_search, - init_from_random=self._random_init, - init_from_tree=self._tree_init, - ) - _search = make_initialized_nnd_search(dist_func, dist_args) - - else: - from numba import njit - from umap.nndescent import initialized_nnd_search - - @njit - def partial_dist_func(x, y): - return dist_func(x, y, *dist_args) - - _initialise_search = partial(initialise_search, dist=partial_dist_func) - _search = partial(initialized_nnd_search, dist=partial_dist_func) - - self._dist_func = partial_dist_func - - self._initialise_search = _initialise_search - self._search = _search - def _init_pynndescent(self, distances): from pynndescent import NNDescent - self._use_pynndescent = True - first_col = np.arange(distances.shape[0])[:, None] init_indices = np.hstack((first_col, np.stack(distances.tolil().rows))) @@ -365,27 +299,13 @@ def _init_neighbors(self, adata, neighbors_key): if "metric_kwds" in neighbors["params"]: self._metric_kwds = neighbors["params"]["metric_kwds"] - dist_args = tuple(self._metric_kwds.values()) else: self._metric_kwds = {} - dist_args = () self._metric = neighbors["params"]["metric"] - if pkg_version("umap-learn") < version.parse("0.5.0"): - self._init_dist_search(dist_args) - - search_graph = neighbors["distances"].copy() - search_graph.data = (search_graph.data > 0).astype(np.int8) - self._search_graph = search_graph.maximum(search_graph.transpose()) - - if "rp_forest" in neighbors: - self._rp_forest = _rp_forest_generate(neighbors["rp_forest"]) - else: - self._rp_forest = None - else: - self._neigh_random_state = neighbors["params"].get("random_state", 0) - self._init_pynndescent(neighbors["distances"]) + self._neigh_random_state = neighbors["params"].get("random_state", 0) + self._init_pynndescent(neighbors["distances"]) def _init_pca(self, adata): self._pca_centered = adata.uns["pca"]["params"]["zero_center"] @@ -411,9 +331,6 @@ def __init__(self, adata: AnnData, neighbors_key: str | None = None): self._adata_ref = adata self._adata_new = None - # only use with umap > 0.5.0 - self._use_pynndescent = False - if "pca" in adata.uns: self._init_pca(adata) @@ -498,29 +415,13 @@ def neighbors(self, k=None, queue_size=5, epsilon=0.1, random_state=0): random_state = check_random_state(random_state) rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) - train = self._rep test = self._obsm["rep"] if k is None: k = self._n_neighbors - if self._use_pynndescent: - self._nnd_idx.search_rng_state = rng_state - - self._indices, self._distances = self._nnd_idx.query(test, k, epsilon) - - else: - from umap.utils import deheap_sort - - init = self._initialise_search( - self._rp_forest, train, test, int(k * queue_size), rng_state=rng_state - ) - - result = self._search( - train, self._search_graph.indptr, self._search_graph.indices, init, test - ) - indices, dists = deheap_sort(result) - self._indices, self._distances = indices[:, :k], dists[:, :k] + self._nnd_idx.search_rng_state = rng_state + 
self._indices, self._distances = self._nnd_idx.query(test, k, epsilon) def _umap_transform(self): return self._umap.transform(self._obsm["rep"]) diff --git a/scanpy/tools/_leiden.py b/scanpy/tools/_leiden.py index aa8a9f4981..5345e3567b 100644 --- a/scanpy/tools/_leiden.py +++ b/scanpy/tools/_leiden.py @@ -1,6 +1,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING +import importlib +import warnings +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -34,14 +36,15 @@ def leiden( random_state: _utils.AnyRandom = 0, key_added: str = "leiden", adjacency: sparse.spmatrix | None = None, - directed: bool = True, + directed: bool | None = None, use_weights: bool = True, n_iterations: int = -1, partition_type: type[MutableVertexPartition] | None = None, neighbors_key: str | None = None, obsp: str | None = None, copy: bool = False, - **partition_kwargs, + flavor: Literal["leidenalg", "igraph"] = "leidenalg", + **clustering_args, ) -> AnnData | None: """\ Cluster cells into subgroups [Traag18]_. @@ -80,6 +83,7 @@ def leiden( How many iterations of the Leiden clustering algorithm to perform. Positive values above 2 define the total number of iterations to perform, -1 has the algorithm run until it reaches its optimal clustering. + 2 is faster and the default for underlying packages. partition_type Type of partition to use. Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. @@ -96,9 +100,11 @@ def leiden( `obsp` and `neighbors_key` at the same time. copy Whether to copy `adata` or modify it inplace. - **partition_kwargs - Any further arguments to pass to `~leidenalg.find_partition` - (which in turn passes arguments to the `partition_type`). + flavor + Which package's implementation to use. + **clustering_args + Any further arguments to pass to :func:`~leidenalg.find_partition` (which in turn passes arguments to the `partition_type`) + or :meth:`igraph.Graph.community_leiden` from `igraph`. Returns ------- @@ -112,13 +118,35 @@ def leiden( A dict with the values for the parameters `resolution`, `random_state`, and `n_iterations`. """ - try: - import leidenalg - except ImportError: + if flavor not in {"igraph", "leidenalg"}: + raise ValueError( + f"flavor must be either 'igraph' or 'leidenalg', but '{flavor}' was passed" + ) + igraph_spec = importlib.util.find_spec("igraph") + if igraph_spec is None: raise ImportError( - "Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`." + "Please install the igraph package: `conda install -c conda-forge igraph` or `pip3 install igraph`." ) - partition_kwargs = dict(partition_kwargs) + if flavor == "igraph": + if directed: + raise ValueError( + "Cannot use igraph's leiden implementation with a directed graph." + ) + if partition_type is not None: + raise ValueError( + "Do not pass in partition_type argument when using igraph." + ) + else: + try: + import leidenalg + + msg = 'Use of leidenalg is discouraged and will be deprecated in the future. Please use `flavor="igraph"` and `n_iterations=2` to achieve similar results. `directed` must also be `False` to work with `igraph`\'s implementation.' + warnings.warn(msg, FutureWarning) + except ImportError: + raise ImportError( + "Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`."
+ ) + clustering_args = dict(clustering_args) start = logg.info("running Leiden clustering") adata = adata.copy() if copy else adata @@ -133,23 +161,29 @@ def leiden( restrict_categories=restrict_categories, adjacency=adjacency, ) - # convert it to igraph - g = _utils.get_igraph_from_adjacency(adjacency, directed=directed) - # flip to the default partition type if not overriden by the user - if partition_type is None: - partition_type = leidenalg.RBConfigurationVertexPartition # Prepare find_partition arguments as a dictionary, # appending to whatever the user provided. It needs to be this way # as this allows for the accounting of a None resolution # (in the case of a partition variant that doesn't take it on input) - if use_weights: - partition_kwargs["weights"] = np.array(g.es["weight"]).astype(np.float64) - partition_kwargs["n_iterations"] = n_iterations - partition_kwargs["seed"] = random_state + clustering_args["n_iterations"] = n_iterations if resolution is not None: - partition_kwargs["resolution_parameter"] = resolution - # clustering proper - part = leidenalg.find_partition(g, partition_type, **partition_kwargs) + clustering_args["resolution_parameter"] = resolution + if flavor == "leidenalg": + directed = True if directed is None else directed + g = _utils.get_igraph_from_adjacency(adjacency, directed=directed) + if partition_type is None: + partition_type = leidenalg.RBConfigurationVertexPartition + if use_weights: + clustering_args["weights"] = np.array(g.es["weight"]).astype(np.float64) + clustering_args["seed"] = random_state + part = leidenalg.find_partition(g, partition_type, **clustering_args) + else: + g = _utils.get_igraph_from_adjacency(adjacency, directed=False) + if use_weights: + clustering_args["weights"] = "weight" + clustering_args.setdefault("objective_function", "modularity") + with _utils.set_igraph_random_state(random_state): + part = g.community_leiden(**clustering_args) # store output into adata.obs groups = np.array(part.membership) if restrict_to is not None: diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py index ba9196a842..8e133434bf 100644 --- a/scanpy/tools/_rank_genes_groups.py +++ b/scanpy/tools/_rank_genes_groups.py @@ -582,7 +582,7 @@ def rank_genes_groups( Notes ----- There are slight inconsistencies depending on whether sparse - or dense data are passed. See `here `__. + or dense data are passed. See `here `__. 
Examples -------- diff --git a/scanpy/tools/_score_genes.py b/scanpy/tools/_score_genes.py index 3e31f0cc66..307a162b12 100644 --- a/scanpy/tools/_score_genes.py +++ b/scanpy/tools/_score_genes.py @@ -51,10 +51,10 @@ def _sparse_nanmean(X, axis): ) def score_genes( adata: AnnData, - gene_list: Sequence[str], + gene_list: Sequence[str] | pd.Index[str], *, ctrl_size: int = 50, - gene_pool: Sequence[str] | None = None, + gene_pool: Sequence[str] | pd.Index[str] | None = None, n_bins: int = 25, score_name: str = "score", random_state: AnyRandom = 0, @@ -114,26 +114,20 @@ def score_genes( if random_state is not None: np.random.seed(random_state) - gene_list_in_var = [] var_names = adata.raw.var_names if use_raw else adata.var_names - genes_to_ignore = [] - for gene in gene_list: - if gene in var_names: - gene_list_in_var.append(gene) - else: - genes_to_ignore.append(gene) + gene_list = pd.Index([gene_list] if isinstance(gene_list, str) else gene_list) + genes_to_ignore = gene_list.difference(var_names, sort=False) # first get missing + gene_list = gene_list.intersection(var_names) # then restrict to present if len(genes_to_ignore) > 0: logg.warning(f"genes are not in var_names and ignored: {genes_to_ignore}") - gene_list = set(gene_list_in_var) - if len(gene_list) == 0: raise ValueError("No valid genes were passed for scoring.") if gene_pool is None: - gene_pool = list(var_names) + gene_pool = pd.Index(var_names, dtype="string") else: - gene_pool = [x for x in gene_pool if x in var_names] - if not gene_pool: + gene_pool = pd.Index(gene_pool, dtype="string").intersection(var_names) + if len(gene_pool) == 0: raise ValueError("No valid genes were passed for reference set.") # Trying here to match the Seurat approach in scoring cells. @@ -144,34 +138,28 @@ def score_genes( _adata_subset = ( _adata[:, gene_pool] if len(gene_pool) < len(_adata.var_names) else _adata ) + # average expression of genes if issparse(_adata_subset.X): obs_avg = pd.Series( np.array(_sparse_nanmean(_adata_subset.X, axis=0)).flatten(), index=gene_pool, - ) # average expression of genes + ) else: - obs_avg = pd.Series( - np.nanmean(_adata_subset.X, axis=0), index=gene_pool - ) # average expression of genes + obs_avg = pd.Series(np.nanmean(_adata_subset.X, axis=0), index=gene_pool) - obs_avg = obs_avg[ - np.isfinite(obs_avg) - ] # Sometimes (and I don't know how) missing data may be there, with nansfor + # Sometimes (and I don't know how) missing data may be there, with nansfor + obs_avg = obs_avg[np.isfinite(obs_avg)] n_items = int(np.round(len(obs_avg) / (n_bins - 1))) obs_cut = obs_avg.rank(method="min") // n_items - control_genes = set() + control_genes = pd.Index([], dtype="string") # now pick `ctrl_size` genes from every cut - for cut in np.unique(obs_cut.loc[list(gene_list)]): - r_genes = np.array(obs_cut[obs_cut == cut].index) - np.random.shuffle(r_genes) - # uses full r_genes if ctrl_size > len(r_genes) - control_genes.update(set(r_genes[:ctrl_size])) - - # To index, we need a list – indexing implies an order. 
- control_genes = list(control_genes - gene_list) - gene_list = list(gene_list) + for cut in np.unique(obs_cut.loc[gene_list]): + r_genes: pd.Index[str] = obs_cut[obs_cut == cut].index + if ctrl_size < len(r_genes): + r_genes = r_genes.to_series().sample(ctrl_size).index + control_genes = control_genes.union(r_genes.difference(gene_list)) X_list = _adata[:, gene_list].X if issparse(X_list): diff --git a/scanpy/tools/_umap.py b/scanpy/tools/_umap.py index 6b18438e1b..8105c3abd8 100644 --- a/scanpy/tools/_umap.py +++ b/scanpy/tools/_umap.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Literal import numpy as np -from packaging import version from sklearn.utils import check_array, check_random_state from .. import logging as logg @@ -165,29 +164,12 @@ def umap( f'.obsp["{neighbors["connectivities_key"]}"] have not been computed using umap' ) - # Compat for umap 0.4 -> 0.5 with warnings.catch_warnings(): # umap 0.5.0 warnings.filterwarnings("ignore", message=r"Tensorflow not installed") import umap - if version.parse(umap.__version__) >= version.parse("0.5.0"): - - def simplicial_set_embedding(*args, **kwargs): - from umap.umap_ import simplicial_set_embedding - - X_umap, _ = simplicial_set_embedding( - *args, - densmap=False, - densmap_kwds={}, - output_dens=False, - **kwargs, - ) - return X_umap - - else: - from umap.umap_ import simplicial_set_embedding - from umap.umap_ import find_ab_params + from umap.umap_ import find_ab_params, simplicial_set_embedding if a is None or b is None: a, b = find_ab_params(spread, min_dist) @@ -222,20 +204,23 @@ def simplicial_set_embedding(*args, **kwargs): # for the init condition in the UMAP embedding default_epochs = 500 if neighbors["connectivities"].shape[0] <= 10000 else 200 n_epochs = default_epochs if maxiter is None else maxiter - X_umap = simplicial_set_embedding( - X, - neighbors["connectivities"].tocoo(), - n_components, - alpha, - a, - b, - gamma, - negative_sample_rate, - n_epochs, - init_coords, - random_state, - neigh_params.get("metric", "euclidean"), - neigh_params.get("metric_kwds", {}), + X_umap, _ = simplicial_set_embedding( + data=X, + graph=neighbors["connectivities"].tocoo(), + n_components=n_components, + initial_alpha=alpha, + a=a, + b=b, + gamma=gamma, + negative_sample_rate=negative_sample_rate, + n_epochs=n_epochs, + init=init_coords, + random_state=random_state, + metric=neigh_params.get("metric", "euclidean"), + metric_kwds=neigh_params.get("metric_kwds", {}), + densmap=False, + densmap_kwds={}, + output_dens=False, verbose=settings.verbosity > 3, ) elif method == "rapids":
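# Editor's note (sketch, not part of the diff): the final hunk above switches scanpy's call to
# umap-learn's simplicial_set_embedding to explicit keyword arguments and unpacks the
# (embedding, aux_data) tuple returned by umap-learn >= 0.5. A rough standalone sketch of the
# same wiring; X, connectivities, init_coords, spread and min_dist stand in for the values
# scanpy derives from the AnnData object and its neighbors/umap settings:
#
#   from sklearn.utils import check_random_state
#   from umap.umap_ import find_ab_params, simplicial_set_embedding
#
#   a, b = find_ab_params(spread, min_dist)
#   X_umap, _ = simplicial_set_embedding(
#       data=X, graph=connectivities.tocoo(), n_components=2, initial_alpha=1.0,
#       a=a, b=b, gamma=1.0, negative_sample_rate=5, n_epochs=200, init=init_coords,
#       random_state=check_random_state(0), metric="euclidean", metric_kwds={},
#       densmap=False, densmap_kwds={}, output_dens=False, verbose=False,
#   )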