Skip to content

Commit

Permalink
(feat): igraph leiden implementation now included as an option in `…
Browse files Browse the repository at this point in the history
…sc.tl.leiden` (#2815)

* (feat): igraph as option for leiden

* (feat): add test for similarity

* (feat): migrate defaults to `igraph`

* (chore): add test for `directed` + `igraph`

* (chore): change expected images

* (fix): weights condition bug

* (fix): change `rank_genes_groups` tolerance and update test images

* (feat): new violin plot based on redone cluster assignments

* (chore): check parameters matching

* (fix): handle import properly

* (fix): handle `partition_type` with `use_igraph`

* (chore): remove unnecessary test args

* (chore): add test for old defaults

* (chore): pre-commit?

* (chore): pre-commit hooks run

* (chore): make violin plot `expected` correct

* (fix): change `tol` again for violin plots

* (chore): revert tolerance change - separate issue incoming

* (chore): release note

* (chore): try new plots with random seed set

* (test): try publishing artifacts

* (fix): publish artifact

* (fix): publish other images

* (chore): umap

* (fix): fix random seeding for `igraph`

* (fix): import in function

* (fix): remove umap from test

* (fix): try different random?

* (feat): try marker gene labeling + write results

* (fix): publish artifacts

* (fix): try writing out data after relabel

* (fix): try stable dataset

* (chore): add more writes

* (fix): sort categories

* (fix): require igraph

* (chore): remove build artifact

* (fix): spelling error

* (fix): swap changed after re-ordering

* (chore): `use_igraph` -> `use_leidenalg`

* fmt

* (refactor): `use_leidenalg` -> `backend`

* (refactor): get `objective_function` from `clustering_args`

* (fix): docstring links

* (refactor): create rng for igraph

* (refactor): less lines

* (chore): add test for random state

* (refactor): fix initial state settings for other `igraph` methods by using `random` again

* (refactor): `FLAVORS` reuse in test

* Update scanpy/tools/_leiden.py

Co-authored-by: Philipp A. <[email protected]>

* Update scanpy/_utils/__init__.py

Co-authored-by: Philipp A. <[email protected]>

* Update scanpy/_utils/__init__.py

Co-authored-by: Philipp A. <[email protected]>

* (fix): fix heatmap plot

* (fix): change out images for new random seed method

* Update scanpy/tools/_leiden.py

Co-authored-by: Philipp A. <[email protected]>

* (chore): switch back to `leidenalg` default

* (chore): fix clustering tests and update message

* (fix): plotting test

* (fix): `test_leiden_basic` `directed` arg

* (fix): fix iterations to defaults

* (fix): correct category swapping

* (fix): need to reorder categories as well

* (fix): clean up simple tests

* (fix): remove unnecessary cluster swap.

* (fix): just use random state that gives same number of categories

* (fix): use `np.random` instead of `random` module

* (chore): remove unnecessary comment in test about state

* (refactor): simplify conditions

* (refactor): `elif` -> `else` when `flavor` already checked

* (fix): move leiden import for test

* (fix): revert unnecessary image changes

* (chore): address comments

---------

Co-authored-by: Philipp A <[email protected]>
  • Loading branch information
ilan-gold and flying-sheep authored Feb 19, 2024
1 parent 1ac74a7 commit 6ee18b9
Show file tree
Hide file tree
Showing 12 changed files with 276 additions and 45 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1.10.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca`
now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer`
* {func}`scanpy.tl.rank_genes_groups` no longer warns that its default was changed from t-test_overestim_var to t-test {pr}`2798` {smaller}`L Heumos`
* {func}`scanpy.tl.leiden` now offers `igraph`'s implementation of the leiden algorithm via `flavor` when set to `igraph`. `leidenalg`'s implementation is still default, but discouraged. {pr}`2815` {smaller}`I Gold`
* {func}`scanpy.pp.highly_variable_genes` has new flavor `seurat_v3_paper` that is in its implementation consistent with the paper description in Stuart et al 2018. {pr}`2792` {smaller}`E Roellin`
* {func}`scanpy.pp.highly_variable_genes` supports dask for the default `seurat` and `cell_ranger` flavors {pr}`2809` {smaller}`P Angerer`
* Auto conversion of strings to collections in `scanpy.pp.calculate_qc_metrics` {pr}`2859` {smaller}`N Teyssier`
Expand Down
41 changes: 37 additions & 4 deletions scanpy/_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@

import importlib.util
import inspect
import random
import sys
import warnings
from collections import namedtuple
from contextlib import contextmanager
from enum import Enum
from functools import partial, singledispatch, wraps
from textwrap import dedent
Expand All @@ -20,10 +22,10 @@
import numpy as np
from anndata import AnnData
from anndata import __version__ as anndata_version
from numpy import random
from numpy.typing import NDArray
from packaging import version
from scipy import sparse
from sklearn.utils import check_random_state

from .. import logging as logg
from .._compat import DaskArray
Expand All @@ -45,7 +47,38 @@ def __repr__(self) -> str:
_empty = Empty.token

# e.g. https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
AnyRandom = Union[int, random.RandomState, None] # maybe in the future random.Generator
# maybe in the future random.Generator
AnyRandom = Union[int, np.random.RandomState, None]


class RNGIgraph:
"""
Random number generator for ipgraph so global seed is not changed.
See :func:`igraph.set_random_number_generator` for the requirements.
"""

def __init__(self, random_state: int = 0) -> None:
self._rng = check_random_state(random_state)

def __getattr__(self, attr: str):
return getattr(self._rng, "normal" if attr == "gauss" else attr)


@contextmanager
def set_igraph_random_state(random_state: int):
    """
    Temporarily make igraph draw random numbers from a seeded ``RNGIgraph``.

    On entry an ``RNGIgraph`` built from ``random_state`` is passed to
    ``igraph.set_random_number_generator``. On exit, whether or not the body
    raised, igraph's generator is set to Python's ``random`` module.

    Parameters
    ----------
    random_state
        Seed used to construct the ``RNGIgraph``.

    Raises
    ------
    ImportError
        If ``igraph`` cannot be imported.
    """
    try:
        import igraph
    except ImportError as err:
        # Chain explicitly so the original import failure stays attached as
        # the cause of the user-facing installation hint.
        raise ImportError(
            "Please install igraph: `conda install -c conda-forge igraph` or `pip3 install igraph`."
        ) from err
    rng = RNGIgraph(random_state)
    try:
        igraph.set_random_number_generator(rng)
        yield None
    finally:
        # Always hand igraph the stdlib ``random`` module again so the seeded
        # generator does not outlive the ``with`` block.
        igraph.set_random_number_generator(random)


EPS = 1e-15

Expand Down Expand Up @@ -459,10 +492,10 @@ def moving_average(a: np.ndarray, n: int):
return ret[n - 1 :] / n


def get_random_state(seed: AnyRandom) -> random.RandomState:
def get_random_state(seed: AnyRandom) -> np.random.RandomState:
if isinstance(seed, np.random.RandomState):
return seed
return random.RandomState(seed)
return np.random.RandomState(seed)


# --------------------------------------------------------------------------------
Expand Down
Binary file modified scanpy/tests/_images/heatmap_var_as_dict/expected.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
58 changes: 43 additions & 15 deletions scanpy/tests/notebooks/test_pbmc3k.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,10 @@

@needs.leidenalg
def test_pbmc3k(image_comparer):
# ensure violin plots and other non-deterministic plots have deterministic behavior
np.random.seed(0)
save_and_compare_images = partial(image_comparer, ROOT, tol=20)

adata = sc.read(
"./data/pbmc3k_raw.h5ad", backup_url="https://falexwolf.de/data/pbmc3k_raw.h5ad"
)
adata = sc.datasets.pbmc3k()

# Preprocessing

Expand Down Expand Up @@ -105,13 +104,48 @@ def test_pbmc3k(image_comparer):

# Clustering the graph

sc.tl.leiden(adata, resolution=0.9)
# sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'], show=False)
# save_and_compare_images('umap_2')
sc.tl.leiden(
adata,
resolution=0.9,
random_state=0,
directed=False,
n_iterations=2,
flavor="igraph",
)

# sc.pl.umap(adata, color=["leiden", "CST3", "NKG7"], show=False)
# save_and_compare_images("umap_2")
sc.pl.scatter(adata, "CST3", "NKG7", color="leiden", show=False)
save_and_compare_images("scatter_3")

# Finding marker genes
# Due to inconsistency between our test runner and local runs, these clusters need to
# be pre-annotated, as the numbers for each cluster are not consistent.
marker_genes = [
"RP11-18H21.1",
"GZMK",
"CD79A",
"FCGR3A",
"GNLY",
"S100A8",
"FCER1A",
"PPBP",
]
new_labels = ["0", "1", "2", "3", "4", "5", "6", "7"]
data_df = adata[:, marker_genes].to_df()
data_df["leiden"] = adata.obs["leiden"]
max_idxs = data_df.groupby("leiden", observed=True).mean().idxmax()
leiden_relabel = {}
for marker_gene, new_label in zip(marker_genes, new_labels):
leiden_relabel[max_idxs[marker_gene]] = new_label
adata.obs["leiden_old"] = adata.obs["leiden"].copy()
adata.rename_categories(
"leiden", [leiden_relabel[key] for key in sorted(leiden_relabel.keys())]
)
# ensure that the column can be sorted for consistent plotting since it is by default unordered
adata.obs["leiden"] = adata.obs["leiden"].cat.reorder_categories(
list(map(str, range(len(adata.obs["leiden"].cat.categories)))), ordered=True
)

sc.tl.rank_genes_groups(adata, "leiden")
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False)
Expand All @@ -129,26 +163,20 @@ def test_pbmc3k(image_comparer):
# sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)
# save_and_compare_images('rank_genes_groups_4')

if adata[adata.obs["leiden"] == "4", "CST3"].X.mean() < 1:
( # switch clusters
adata.obs["leiden"][adata.obs["leiden"] == "4"],
adata.obs["leiden"][adata.obs["leiden"] == "5"],
) = ("5", "4")
new_cluster_names = [
"CD4 T cells",
"CD14+ Monocytes",
"B cells",
"CD8 T cells",
"B cells",
"NK cells",
"FCGR3A+ Monocytes",
"CD14+ Monocytes",
"Dendritic cells",
"Megakaryocytes",
]
adata.rename_categories("leiden", new_cluster_names)

# sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, show=False)
# save_and_compare_images('umap_3')

sc.pl.violin(
adata, ["CST3", "NKG7", "PPBP"], groupby="leiden", rotation=90, show=False
)
Expand Down
132 changes: 130 additions & 2 deletions scanpy/tests/test_clustering.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import pytest
from sklearn.metrics.cluster import normalized_mutual_info_score

import scanpy as sc
from scanpy.testing._helpers.data import pbmc68k_reduced
Expand All @@ -12,11 +13,136 @@ def adata_neighbors():
return pbmc68k_reduced()


FLAVORS = [
pytest.param("igraph", marks=needs.igraph),
pytest.param("leidenalg", marks=needs.leidenalg),
]


@needs.leidenalg
@needs.igraph
@pytest.mark.parametrize("flavor", FLAVORS)
@pytest.mark.parametrize("resolution", [1, 2])
@pytest.mark.parametrize("n_iterations", [-1, 3])
def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):
    """Run leiden per flavor and check the requested params are recorded in ``uns``."""
    # ``directed`` is only switched on for the ``leidenalg`` flavor.
    use_directed = flavor == "leidenalg"
    sc.tl.leiden(
        adata_neighbors,
        flavor=flavor,
        resolution=resolution,
        n_iterations=n_iterations,
        directed=use_directed,
    )
    stored_params = adata_neighbors.uns["leiden"]["params"]
    assert stored_params["resolution"] == resolution
    assert stored_params["n_iterations"] == n_iterations


@needs.leidenalg
def test_leiden_basic(adata_neighbors):
sc.tl.leiden(adata_neighbors)
@needs.igraph
@pytest.mark.parametrize("flavor", FLAVORS)
def test_leiden_random_state(adata_neighbors, flavor):
is_leiden_alg = flavor == "leidenalg"
n_iterations = 2 if is_leiden_alg else -1
adata_1 = sc.tl.leiden(
adata_neighbors,
flavor=flavor,
random_state=1,
copy=True,
directed=is_leiden_alg,
n_iterations=n_iterations,
)
adata_1_again = sc.tl.leiden(
adata_neighbors,
flavor=flavor,
random_state=1,
copy=True,
directed=is_leiden_alg,
n_iterations=n_iterations,
)
adata_2 = sc.tl.leiden(
adata_neighbors,
flavor=flavor,
random_state=2,
copy=True,
directed=is_leiden_alg,
n_iterations=n_iterations,
)
assert (adata_1.obs["leiden"] == adata_1_again.obs["leiden"]).all()
assert (adata_2.obs["leiden"] != adata_1_again.obs["leiden"]).any()


@needs.igraph
def test_leiden_igraph_directed(adata_neighbors):
    """Expect a ``ValueError`` when the igraph flavor is combined with ``directed=True``."""
    leiden_kwargs = {"flavor": "igraph", "directed": True}
    with pytest.raises(ValueError):
        sc.tl.leiden(adata_neighbors, **leiden_kwargs)


@needs.igraph
def test_leiden_wrong_flavor(adata_neighbors):
    """Expect a ``ValueError`` when ``flavor`` is the unrecognised value ``"foo"``."""
    bad_flavor = "foo"
    with pytest.raises(ValueError):
        sc.tl.leiden(adata_neighbors, flavor=bad_flavor)


@needs.igraph
@needs.leidenalg
def test_leiden_igraph_partition_type(adata_neighbors):
    """Expect a ``ValueError`` when a ``leidenalg`` partition type is passed with the igraph flavor."""
    import leidenalg

    partition_cls = leidenalg.RBConfigurationVertexPartition
    with pytest.raises(ValueError):
        sc.tl.leiden(adata_neighbors, flavor="igraph", partition_type=partition_cls)


@needs.leidenalg
@needs.igraph
def test_leiden_equal_defaults_same_args(adata_neighbors):
    """Check that both flavors, run with two iterations, give clusterings with NMI above 0.9."""
    from_leidenalg = sc.tl.leiden(
        adata_neighbors, flavor="leidenalg", copy=True, n_iterations=2
    )
    from_igraph = sc.tl.leiden(
        adata_neighbors, flavor="igraph", copy=True, directed=False, n_iterations=2
    )
    nmi = normalized_mutual_info_score(
        from_leidenalg.obs["leiden"], from_igraph.obs["leiden"]
    )
    assert nmi > 0.9


@needs.leidenalg
@needs.igraph
def test_leiden_equal_defaults(adata_neighbors):
    """Compare a directed ``leidenalg`` run with an undirected two-iteration run that sets no flavor.

    The two labelings must reach a normalized mutual information above 0.9.
    """
    directed_result = sc.tl.leiden(
        adata_neighbors, flavor="leidenalg", directed=True, copy=True
    )
    undirected_result = sc.tl.leiden(
        adata_neighbors, copy=True, n_iterations=2, directed=False
    )
    nmi = normalized_mutual_info_score(
        directed_result.obs["leiden"], undirected_result.obs["leiden"]
    )
    assert nmi > 0.9


@needs.igraph
def test_leiden_objective_function(adata_neighbors):
    """Smoke test: passing ``objective_function`` with the igraph flavor must not raise."""
    leiden_kwargs = {
        "objective_function": "modularity",
        "flavor": "igraph",
        "directed": False,
    }
    sc.tl.leiden(adata_neighbors, **leiden_kwargs)


@needs.igraph
@pytest.mark.parametrize(
"clustering,key",
[
Expand Down Expand Up @@ -52,6 +178,7 @@ def test_clustering_subset(adata_neighbors, clustering, key):


@needs.louvain
@needs.igraph
def test_louvain_basic(adata_neighbors):
sc.tl.louvain(adata_neighbors)
sc.tl.louvain(adata_neighbors, use_weights=True)
Expand All @@ -60,6 +187,7 @@ def test_louvain_basic(adata_neighbors):


@needs.louvain
@needs.igraph
def test_partition_type(adata_neighbors):
import louvain

Expand Down
9 changes: 8 additions & 1 deletion scanpy/tests/test_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,14 @@ def test_heatmap(image_comparer):

# test var_names as dict
pbmc = pbmc68k_reduced()
sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5)
sc.tl.leiden(
pbmc,
key_added="clusters",
resolution=0.5,
flavor="igraph",
n_iterations=2,
directed=False,
)
# call umap to trigger colors for the clusters
sc.pl.umap(pbmc, color="clusters")
marker_genes_dict = {
Expand Down
Loading

0 comments on commit 6ee18b9

Please sign in to comment.