(feat): igraph leiden implementation now included as an option in `…

…sc.tl.leiden` (#2815) * (feat): igraph as option for leiden * (feat): add test for similarity * (feat): migrate defaults to `igraph` * (chore): add test for `directed` + `igraph` * (chore): change expected images * (fix): weights condition bug * (fix): change `rank_genes_groups` tolerance and update test images * (feat): new violin plot based on redone cluster assignments * (chore): check parameters matching * (fix): handle import properly * (fix): handle `partition_type` with `use_igraph` * (chore): remove unnecessary test args * (chore): add test for old defaults * (chore): pre-commit? * (chore): pre-commit hooks run * (chore): make violin plot `expected` correct * (fix): change `tol` again for violin plots * (chore): revert tolerance change - separate issue incoming * (chore): release note * (chore): try new plots with random seed set * (test): try publishing artifacts * (fix): publish artifact * (fix): publish other images * (chore): umap * (fix): fix random seeding for `igraph` * (fix): import in function * (fix): remove umap from test * (fix): try different random? * (feat): try marker gene labeling + write results * (fix): publish artifacts * (fix): try writing out data after relabel * (fix): try stable dataset * (chore): add more writes * (fix): sort categories * (fix): require igraph * (chore): remove build artifact * (fix): spelling error * (fix): swap changed after re-ordering * (chore): `use_igraph` -> `use_leidenalg` * fmt * (refactor): `use_leidenalg` -> `backend` * (refactor): get `objective_function` from `clustering_args` * (fix): docstring links * (refactor): create rng for igraph * (refactor): less lines * (chore): add test for random state * (refactor): fix initial state settings for other `igraph` methods by using `random` again * (refactor): `FLAVORS` reuse in test * Update scanpy/tools/_leiden.py Co-authored-by: Philipp A. <[email protected]> * Update scanpy/_utils/__init__.py Co-authored-by: Philipp A. 
<[email protected]> * Update scanpy/_utils/__init__.py Co-authored-by: Philipp A. <[email protected]> * (fix): fix heatmap plot * (fix): change out images for new random seed method * Update scanpy/tools/_leiden.py Co-authored-by: Philipp A. <[email protected]> * (chore): switch back to `leidenalg` default * (chore): fix clustering tests and update message * (fix): plotting test * (fix): `test_leiden_basic` `directed` arg * (fix): fix iterations to defaults * (fix): correct category swapping * (fix): need to reorder categories as well * (fix): clean up simple tests * (fix): remove unnecessary cluster swap. * (fix): just use random state that gives same number of categories * (fix): use `np.random` instead of `random` module * (chore): remove unnecessary comment in test about state * (refactor): simplify conditions * (refactor): `elif` -> `else` when `flavor` already checked * (fix): move leiden import for test * (fix): revert unnecessary image changes * (chore): address comments --------- Co-authored-by: Philipp A <[email protected]>
scverse · Feb 19, 2024 · 6ee18b9 · 6ee18b9
1 parent 1ac74a7
commit 6ee18b9
Show file tree

Hide file tree

Showing 12 changed files with 276 additions and 45 deletions.
diff --git a/docs/release-notes/1.10.0.md b/docs/release-notes/1.10.0.md
@@ -16,6 +16,7 @@
 * {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca`
   now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer`
 * {func}`scanpy.tl.rank_genes_groups` no longer warns that its default was changed from t-test_overestim_var to t-test {pr}`2798` {smaller}`L Heumos`
+* {func}`scanpy.tl.leiden` now offers `igraph`'s implementation of the Leiden algorithm when `flavor` is set to `igraph`. `leidenalg`'s implementation is still the default, but is discouraged. {pr}`2815` {smaller}`I Gold`
 * {func}`scanpy.pp.highly_variable_genes` has new flavor `seurat_v3_paper` that is in its implementation consistent with the paper description in Stuart et al 2018. {pr}`2792` {smaller}`E Roellin`
 * {func}`scanpy.pp.highly_variable_genes` supports dask for the default `seurat` and `cell_ranger` flavors {pr}`2809` {smaller}`P Angerer`
 * Auto conversion of strings to collections in `scanpy.pp.calculate_qc_metrics` {pr}`2859` {smaller}`N Teyssier`

diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py
@@ -7,9 +7,11 @@
 
 import importlib.util
 import inspect
+import random
 import sys
 import warnings
 from collections import namedtuple
+from contextlib import contextmanager
 from enum import Enum
 from functools import partial, singledispatch, wraps
 from textwrap import dedent
@@ -20,10 +22,10 @@
 import numpy as np
 from anndata import AnnData
 from anndata import __version__ as anndata_version
-from numpy import random
 from numpy.typing import NDArray
 from packaging import version
 from scipy import sparse
+from sklearn.utils import check_random_state
 
 from .. import logging as logg
 from .._compat import DaskArray
@@ -45,7 +47,38 @@ def __repr__(self) -> str:
 _empty = Empty.token
 
 # e.g. https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
-AnyRandom = Union[int, random.RandomState, None]  # maybe in the future random.Generator
+# maybe in the future random.Generator
+AnyRandom = Union[int, np.random.RandomState, None]
+
+
+class RNGIgraph:
+    """
+    Random number generator for igraph so global seed is not changed.
+    See :func:`igraph.set_random_number_generator` for the requirements.
+    """
+
+    def __init__(self, random_state: int = 0) -> None:
+        self._rng = check_random_state(random_state)
+
+    def __getattr__(self, attr: str):
+        return getattr(self._rng, "normal" if attr == "gauss" else attr)
+
+
+@contextmanager
+def set_igraph_random_state(random_state: int):
+    try:
+        import igraph
+    except ImportError:
+        raise ImportError(
+            "Please install igraph: `conda install -c conda-forge igraph` or `pip3 install igraph`."
+        )
+    rng = RNGIgraph(random_state)
+    try:
+        igraph.set_random_number_generator(rng)
+        yield None
+    finally:
+        igraph.set_random_number_generator(random)
+
 
 EPS = 1e-15
 
@@ -459,10 +492,10 @@ def moving_average(a: np.ndarray, n: int):
     return ret[n - 1 :] / n
 
 
-def get_random_state(seed: AnyRandom) -> random.RandomState:
+def get_random_state(seed: AnyRandom) -> np.random.RandomState:
     if isinstance(seed, np.random.RandomState):
         return seed
-    return random.RandomState(seed)
+    return np.random.RandomState(seed)
 
 
 # --------------------------------------------------------------------------------

diff --git a/scanpy/tests/_images/heatmap_var_as_dict/expected.png b/scanpy/tests/_images/heatmap_var_as_dict/expected.png
diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_1/expected.png
diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_2/expected.png
diff --git a/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/rank_genes_groups_3/expected.png
diff --git a/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/scatter_3/expected.png
diff --git a/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png b/scanpy/tests/notebooks/_images_pbmc3k/violin_2/expected.png
diff --git a/scanpy/tests/notebooks/test_pbmc3k.py b/scanpy/tests/notebooks/test_pbmc3k.py
@@ -28,11 +28,10 @@
 
 @needs.leidenalg
 def test_pbmc3k(image_comparer):
+    # ensure violin plots and other non-deterministic plots have deterministic behavior
+    np.random.seed(0)
     save_and_compare_images = partial(image_comparer, ROOT, tol=20)
-
-    adata = sc.read(
-        "./data/pbmc3k_raw.h5ad", backup_url="https://falexwolf.de/data/pbmc3k_raw.h5ad"
-    )
+    adata = sc.datasets.pbmc3k()
 
     # Preprocessing
 
@@ -105,13 +104,48 @@ def test_pbmc3k(image_comparer):
 
     # Clustering the graph
 
-    sc.tl.leiden(adata, resolution=0.9)
-    # sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'], show=False)
-    # save_and_compare_images('umap_2')
+    sc.tl.leiden(
+        adata,
+        resolution=0.9,
+        random_state=0,
+        directed=False,
+        n_iterations=2,
+        flavor="igraph",
+    )
+
+    # sc.pl.umap(adata, color=["leiden", "CST3", "NKG7"], show=False)
+    # save_and_compare_images("umap_2")
     sc.pl.scatter(adata, "CST3", "NKG7", color="leiden", show=False)
     save_and_compare_images("scatter_3")
 
     # Finding marker genes
+    # Due to inconsistency with our test runner vs local, these clusters need to
+    # be pre-annotated as the numbers for each cluster are not consistent.
+    marker_genes = [
+        "RP11-18H21.1",
+        "GZMK",
+        "CD79A",
+        "FCGR3A",
+        "GNLY",
+        "S100A8",
+        "FCER1A",
+        "PPBP",
+    ]
+    new_labels = ["0", "1", "2", "3", "4", "5", "6", "7"]
+    data_df = adata[:, marker_genes].to_df()
+    data_df["leiden"] = adata.obs["leiden"]
+    max_idxs = data_df.groupby("leiden", observed=True).mean().idxmax()
+    leiden_relabel = {}
+    for marker_gene, new_label in zip(marker_genes, new_labels):
+        leiden_relabel[max_idxs[marker_gene]] = new_label
+    adata.obs["leiden_old"] = adata.obs["leiden"].copy()
+    adata.rename_categories(
+        "leiden", [leiden_relabel[key] for key in sorted(leiden_relabel.keys())]
+    )
+    # ensure that the column can be sorted for consistent plotting since it is by default unordered
+    adata.obs["leiden"] = adata.obs["leiden"].cat.reorder_categories(
+        list(map(str, range(len(adata.obs["leiden"].cat.categories)))), ordered=True
+    )
 
     sc.tl.rank_genes_groups(adata, "leiden")
     sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False)
@@ -129,26 +163,20 @@ def test_pbmc3k(image_comparer):
     # sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)
     # save_and_compare_images('rank_genes_groups_4')
 
-    if adata[adata.obs["leiden"] == "4", "CST3"].X.mean() < 1:
-        (  # switch clusters
-            adata.obs["leiden"][adata.obs["leiden"] == "4"],
-            adata.obs["leiden"][adata.obs["leiden"] == "5"],
-        ) = ("5", "4")
     new_cluster_names = [
         "CD4 T cells",
-        "CD14+ Monocytes",
-        "B cells",
         "CD8 T cells",
+        "B cells",
         "NK cells",
         "FCGR3A+ Monocytes",
+        "CD14+ Monocytes",
         "Dendritic cells",
         "Megakaryocytes",
     ]
     adata.rename_categories("leiden", new_cluster_names)
 
     # sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, show=False)
     # save_and_compare_images('umap_3')
-
     sc.pl.violin(
         adata, ["CST3", "NKG7", "PPBP"], groupby="leiden", rotation=90, show=False
     )

diff --git a/scanpy/tests/test_clustering.py b/scanpy/tests/test_clustering.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import pytest
+from sklearn.metrics.cluster import normalized_mutual_info_score
 
 import scanpy as sc
 from scanpy.testing._helpers.data import pbmc68k_reduced
@@ -12,11 +13,136 @@ def adata_neighbors():
     return pbmc68k_reduced()
 
 
+FLAVORS = [
+    pytest.param("igraph", marks=needs.igraph),
+    pytest.param("leidenalg", marks=needs.leidenalg),
+]
+
+
+@needs.leidenalg
+@needs.igraph
+@pytest.mark.parametrize("flavor", FLAVORS)
+@pytest.mark.parametrize("resolution", [1, 2])
+@pytest.mark.parametrize("n_iterations", [-1, 3])
+def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):
+    sc.tl.leiden(
+        adata_neighbors,
+        flavor=flavor,
+        resolution=resolution,
+        n_iterations=n_iterations,
+        directed=(flavor == "leidenalg"),
+    )
+    assert adata_neighbors.uns["leiden"]["params"]["resolution"] == resolution
+    assert adata_neighbors.uns["leiden"]["params"]["n_iterations"] == n_iterations
+
+
 @needs.leidenalg
-def test_leiden_basic(adata_neighbors):
-    sc.tl.leiden(adata_neighbors)
+@needs.igraph
+@pytest.mark.parametrize("flavor", FLAVORS)
+def test_leiden_random_state(adata_neighbors, flavor):
+    is_leiden_alg = flavor == "leidenalg"
+    n_iterations = 2 if is_leiden_alg else -1
+    adata_1 = sc.tl.leiden(
+        adata_neighbors,
+        flavor=flavor,
+        random_state=1,
+        copy=True,
+        directed=is_leiden_alg,
+        n_iterations=n_iterations,
+    )
+    adata_1_again = sc.tl.leiden(
+        adata_neighbors,
+        flavor=flavor,
+        random_state=1,
+        copy=True,
+        directed=is_leiden_alg,
+        n_iterations=n_iterations,
+    )
+    adata_2 = sc.tl.leiden(
+        adata_neighbors,
+        flavor=flavor,
+        random_state=2,
+        copy=True,
+        directed=is_leiden_alg,
+        n_iterations=n_iterations,
+    )
+    assert (adata_1.obs["leiden"] == adata_1_again.obs["leiden"]).all()
+    assert (adata_2.obs["leiden"] != adata_1_again.obs["leiden"]).any()
+
+
+@needs.igraph
+def test_leiden_igraph_directed(adata_neighbors):
+    with pytest.raises(ValueError):
+        sc.tl.leiden(adata_neighbors, flavor="igraph", directed=True)
+
+
+@needs.igraph
+def test_leiden_wrong_flavor(adata_neighbors):
+    with pytest.raises(ValueError):
+        sc.tl.leiden(adata_neighbors, flavor="foo")
+
+
+@needs.igraph
+@needs.leidenalg
+def test_leiden_igraph_partition_type(adata_neighbors):
+    import leidenalg
+
+    with pytest.raises(ValueError):
+        sc.tl.leiden(
+            adata_neighbors,
+            flavor="igraph",
+            partition_type=leidenalg.RBConfigurationVertexPartition,
+        )
+
+
+@needs.leidenalg
+@needs.igraph
+def test_leiden_equal_defaults_same_args(adata_neighbors):
+    """Ensure the two implementations are the same for the same args."""
+    leiden_alg_clustered = sc.tl.leiden(
+        adata_neighbors, flavor="leidenalg", copy=True, n_iterations=2
+    )
+    igraph_clustered = sc.tl.leiden(
+        adata_neighbors, flavor="igraph", copy=True, directed=False, n_iterations=2
+    )
+    assert (
+        normalized_mutual_info_score(
+            leiden_alg_clustered.obs["leiden"], igraph_clustered.obs["leiden"]
+        )
+        > 0.9
+    )
+
+
+@needs.leidenalg
+@needs.igraph
+def test_leiden_equal_defaults(adata_neighbors):
+    """Ensure that the old leidenalg defaults are close enough to the current default outputs."""
+    leiden_alg_clustered = sc.tl.leiden(
+        adata_neighbors, flavor="leidenalg", directed=True, copy=True
+    )
+    igraph_clustered = sc.tl.leiden(
+        adata_neighbors, copy=True, n_iterations=2, directed=False
+    )
+    assert (
+        normalized_mutual_info_score(
+            leiden_alg_clustered.obs["leiden"], igraph_clustered.obs["leiden"]
+        )
+        > 0.9
+    )
+
+
+@needs.igraph
+def test_leiden_objective_function(adata_neighbors):
+    """Ensure that popping this as a `clustering_kwargs` and using it does not error out."""
+    sc.tl.leiden(
+        adata_neighbors,
+        objective_function="modularity",
+        flavor="igraph",
+        directed=False,
+    )
 
 
+@needs.igraph
 @pytest.mark.parametrize(
     "clustering,key",
     [
@@ -52,6 +178,7 @@ def test_clustering_subset(adata_neighbors, clustering, key):
 
 
 @needs.louvain
+@needs.igraph
 def test_louvain_basic(adata_neighbors):
     sc.tl.louvain(adata_neighbors)
     sc.tl.louvain(adata_neighbors, use_weights=True)
@@ -60,6 +187,7 @@ def test_louvain_basic(adata_neighbors):
 
 
 @needs.louvain
+@needs.igraph
 def test_partition_type(adata_neighbors):
     import louvain
 

diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py
@@ -103,7 +103,14 @@ def test_heatmap(image_comparer):
 
     # test var_names as dict
     pbmc = pbmc68k_reduced()
-    sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5)
+    sc.tl.leiden(
+        pbmc,
+        key_added="clusters",
+        resolution=0.5,
+        flavor="igraph",
+        n_iterations=2,
+        directed=False,
+    )
     # call umap to trigger colors for the clusters
     sc.pl.umap(pbmc, color="clusters")
     marker_genes_dict = {