From 284597af98548ed0416ed7f65da1bbf2ba4ea66e Mon Sep 17 00:00:00 2001
From: jdm204
Date: Mon, 10 Feb 2025 13:17:14 +0000
Subject: [PATCH] Add {obs,var}_names_make_unique! (#30)

* add counterpart of Python AnnData's var_names_make_unique

* test counterpart of Python AnnData's var_names_make_unique

* document counterpart of Python AnnData's var_names_make_unique

* add/test/doc counterpart of Python AnnData's obs_names_make_unique

* try to fix Logging compat

* try to fix make_unique docs examples

* fix: typo in duplicateindices

* fixup! fix: typo in duplicateindices

---------

Co-authored-by: jdm204
---
 Project.toml        |  4 ++-
 docs/src/objects.md | 16 ++++++++++
 src/Muon.jl         |  1 +
 src/util.jl         | 78 +++++++++++++++++++++++++++++++++++++++++++++
 test/anndata.jl     | 22 +++++++++++++
 test/runtests.jl    |  2 +-
 6 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 170ad41..723dfbf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ CompressHashDisplace = "0.1.2"
 DataFrames = "0.22, 1"
 FileIO = "1.6"
 HDF5 = "0.16 - 0.99, 1"
+Logging = "1.6.7"
 OrderedCollections = "1.6"
 PooledArrays = "1"
 StructArrays = "0.6.4"
@@ -30,6 +31,7 @@ julia = "1.5"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Logging"]
diff --git a/docs/src/objects.md b/docs/src/objects.md
index af6c948..9aa67f3 100644
--- a/docs/src/objects.md
+++ b/docs/src/objects.md
@@ -60,6 +60,22 @@ ad_sub[[true,false,true],:]
 ad_sub[[1,3],:]
 ```
 
+### Operations on AnnData Objects
+
+Make duplicate `var_names` unique by appending a numbered suffix.
+
+```@example 1
+import Muon: var_names_make_unique! # hide
+var_names_make_unique!(ad)
+```
+
+Similarly, we can make `obs_names` unique.
+
+```@example 1
+import Muon: obs_names_make_unique! # hide
+obs_names_make_unique!(ad)
+```
+
 ## MuData
 
 The basic idea behind a multimodal object is _key_ ``\rightarrow`` _value_ relationship where _keys_ represent the unique names of individual modalities and _values_ are `AnnData` objects that contain the correposnding data. Similarly to `AnnData` objects, `MuData` objects can also contain rich multimodal annotations.
diff --git a/src/Muon.jl b/src/Muon.jl
index fac0c36..0250254 100644
--- a/src/Muon.jl
+++ b/src/Muon.jl
@@ -15,6 +15,7 @@ using FileIO
 
 export readh5mu, readh5ad, writeh5mu, writeh5ad, isbacked, update_obs!, update_var!, update!
 export AnnData, MuData
+export var_names_make_unique!, obs_names_make_unique!
 
 import Pkg
 # this executes only during precompilation
diff --git a/src/util.jl b/src/util.jl
index 9faff18..82f5116 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -114,3 +114,81 @@ Base.firstindex(A::Union{AbstractMuData, AbstractAnnData}, d::Integer) = 1
 Base.lastindex(A::Union{AbstractMuData, AbstractAnnData}, d::Integer) = size(A, d)
 
 Base.copy(d::Union{MuDataView, AnnDataView}) = parent(d)[parentindices(d)...]
+
+"""
+    var_names_make_unique!(A::AnnData, join = '-')
+
+Make `A.var_names` unique by appending `join` and sequential numbers
+(1, 2, 3 etc) to duplicate elements, leaving the first unchanged.
+"""
+function var_names_make_unique!(A::AnnData, join='-')
+    index_make_unique!(A.var_names, join)
+end
+
+"""
+    obs_names_make_unique!(A::AnnData, join = '-')
+
+Make `A.obs_names` unique by appending `join` and sequential numbers
+(1, 2, 3 etc) to duplicate elements, leaving the first unchanged.
+"""
+function obs_names_make_unique!(A::AnnData, join='-')
+    index_make_unique!(A.obs_names, join)
+end
+
+# Rename repeated entries of `index` in place: the first occurrence keeps its
+# name, each later one gets `join` and the next unused counter appended.
+# Shared by `var_names_make_unique!` and `obs_names_make_unique!`.
+function index_make_unique!(index, join)
+    duplicates = duplicateindices(index)
+
+    if isempty(duplicates)
+        @info "names are already unique, doing nothing"
+        return nothing
+    end
+
+    example_colliding_names = String[]
+    set = Set(index)
+
+    for (name, positions) in duplicates
+        i = 1
+        # Skip the first occurrence; it keeps its original name.
+        for pos in Iterators.drop(positions, 1)
+            while true
+                potential = string(index[pos], join, i)
+                i += 1
+                if potential in set
+                    # Candidate is already taken: keep a few examples for the warning.
+                    if length(example_colliding_names) <= 5
+                        push!(example_colliding_names, potential)
+                    end
+                else
+                    index[pos] = potential
+                    push!(set, potential)
+                    break
+                end
+            end
+        end
+    end
+
+    if !isempty(example_colliding_names)
+        @warn """
+        Appending $(join)[1-9...] to duplicates caused collision with another name.
+        Example(s): $example_colliding_names
+        This may make the names hard to interpret.
+        Consider passing a different delimiter as the `join` argument.
+        """
+    end
+    return nothing
+end
+
+# Map each name occurring more than once in `v` to the positions where it occurs.
+function duplicateindices(v::Muon.Index{T, I}) where {T <: AbstractString, I <: Integer}
+    varnames = Dict{T, Vector{Int64}}()
+
+    for i in eachindex(v)
+        push!(get!(() -> Int64[], varnames, v[i]), i)
+    end
+
+    filter!(x -> length(last(x)) > 1, varnames)
+    return varnames
+end
diff --git a/test/anndata.jl b/test/anndata.jl
index 9499aa2..01ebed0 100644
--- a/test/anndata.jl
+++ b/test/anndata.jl
@@ -39,3 +39,25 @@ end
     @test copy(adview).X == subad.X
     test_ad_slicing(subad, 50, 5, x[i, 3:7])
 end
+
+@testset "unique names" begin
+    @test_logs (:info,) var_names_make_unique!(ad)
+    @test_logs (:info,) obs_names_make_unique!(ad)
+    ad2 = deepcopy(ad)
+    ad2.var_names[3] = "10"
+    ad2.obs_names[90] = "obs_30"
+    var_names_make_unique!(ad2)
+    obs_names_make_unique!(ad2)
+    @test allunique(ad2.var_names)
+    @test allunique(ad2.obs_names)
+    ad2.var_names[10] = "10-1"
+    ad2.var_names[3] = "10"
+    ad2.var_names[4] = "10"
+    ad2.obs_names[11] = "obs_10-1"
+    ad2.obs_names[10] = "obs_10"
+    ad2.obs_names[9] = "obs_10"
+    @test_logs (:warn,) var_names_make_unique!(ad2)
+    @test_logs (:warn,) obs_names_make_unique!(ad2)
+    @test allunique(ad2.var_names)
+    @test allunique(ad2.obs_names)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index df8d0cb..d765c02 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,4 +1,4 @@
-using Muon, Test
+using Muon, Test, Logging
 
 @testset "Index" begin
     include("index.jl")