Merge pull request #28 from dmbates/PooledArrays

Use PooledArrays for unordered categorical vectors
scverse · Mar 20, 2024 · 22cc850 · 22cc850
2 parents 53d51b3 + 9785582
commit 22cc850
Show file tree

Hide file tree

Showing 9 changed files with 72 additions and 56 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 # Files generated by invoking Julia with --code-coverage
 *.jl.cov
 *.jl.*.cov
+lcov.info
 
 # Files generated by invoking Julia with --track-allocation
 *.jl.mem
@@ -22,3 +23,7 @@ docs/site/
 # committed for packages, but should be committed for applications that require a static
 # environment.
 Manifest.toml
+Manifest-v*.toml
+
+# Directory used by VScode for local settings
+.vscode/
diff --git a/Project.toml b/Project.toml
@@ -12,16 +12,24 @@ HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 
 [compat]
-CategoricalArrays = "0.9"
+CategoricalArrays = "0.9,0.10"
 CompressHashDisplace = "0.1.2"
-OrderedCollections = "1.6"
 DataFrames = "0.22, 1"
 FileIO = "1.6"
 HDF5 = "0.16 - 0.99, 1"
+OrderedCollections = "1.6"
+PooledArrays = "1"
 StructArrays = "0.6.4"
 julia = "1.5"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/src/Muon.jl b/src/Muon.jl
@@ -8,6 +8,7 @@ using HDF5
 using DataFrames
 using CategoricalArrays
 using StructArrays
+using PooledArrays
 import CompressHashDisplace: FrozenDict
 import OrderedCollections: OrderedDict
 using FileIO

diff --git a/src/hdf5_io.jl b/src/hdf5_io.jl
@@ -44,13 +44,22 @@ function read_matrix(f::HDF5.Dataset; kwargs...)
             haskey(attributes(categories), "ordered") &&
             read_attribute(categories, "ordered") == true
         cats = read(categories)
-        mat = mat .+ 0x1
-        mat = compress(
-            CategoricalArray{eltype(cats), ndims(mat)}(
-                mat,
-                CategoricalPool{eltype(cats), eltype(mat)}(cats, ordered),
-            ),
-        )
+        mat .+= 1
+        mat = if ordered
+            compress(
+                CategoricalArray{eltype(cats), ndims(mat)}(
+                    mat,
+                    CategoricalPool{eltype(cats), eltype(mat)}(cats, ordered),
+                ),
+            )
+        else
+            PooledArray(
+                PooledArrays.RefArray(mat),
+                Dict{eltype(cats), eltype(mat)}(
+                    v => i for (i, v) in enumerate(cats)
+                )
+            )
+        end
     end
     return mat
 end
@@ -66,8 +75,8 @@ function read_matrix(f::HDF5.Group; kwargs...)
         indices = read(f, "indices")
         data = read(f, "data")
 
-        indptr .+= eltype(indptr)(1)
-        indices .+= eltype(indptr)(1)
+        indptr .+= 1
+        indices .+= 1
 
         # the row indices in every column need to be sorted
         @views for (colstart, colend) in zip(indptr[1:(end - 1)], indptr[2:end])
@@ -82,13 +91,20 @@ function read_matrix(f::HDF5.Group; kwargs...)
     elseif enctype == "categorical"
         ordered = read_attribute(f, "ordered") > 0
         categories = read(f, "categories")
-        codes = read(f, "codes") .+ 1
-
-        T = any(codes .== 0) ? Union{Missing, eltype(categories)} : eltype(categories)
-        mat = CategoricalVector{T}(
-            undef, length(codes); levels=categories, ordered=ordered)
-        copy!(mat.refs, codes)
-
+        codes = read(f, "codes") .+ true
+
+        T = any(iszero, codes) ? Union{Missing, eltype(categories)} : eltype(categories)
+        mat = if ordered
+            # bind to a local: the branch must yield the vector, and `mat` is unset here
+            cv = CategoricalVector{T}(
+                undef, length(codes); levels=categories, ordered=ordered)
+            copy!(cv.refs, codes); cv
+        else
+            PooledArray(
+                PooledArrays.RefArray(codes),
+                Dict{T, eltype(codes)}(v => i for (i, v) in enumerate(categories)),
+            )
+        end
         return mat
     else
         error("unknown encoding $enctype")
@@ -209,16 +225,16 @@ end
 function write_impl(
     parent::Union{HDF5.File, HDF5.Group},
     name::AbstractString,
-    data::CategoricalArray;
+    data::Union{CategoricalArray,PooledArray};
     kwargs...,
 )
     g = create_group(parent, name)
     attrs = attributes(g)
     attrs["encoding-type"] = "categorical"
     attrs["encoding-version"] = "0.2.0"
-    _write_attribute(g, "ordered", isordered(data))
+    _write_attribute(g, "ordered", isa(data, CategoricalArray) && isordered(data))
     write_impl(g, "categories", levels(data); kwargs...)
-    write_impl(g, "codes", data.refs .- 1; kwargs...)
+    write_impl(g, "codes", data.refs .- true; kwargs...)
 end
 
 function write_impl(
@@ -388,8 +404,8 @@ function write_impl(
     shape = collect(size(data))
     transposed && reverse!(shape)
     attrs["shape"] = shape
-    write_impl(g, "indptr", data.colptr .- 1, extensible=true)
-    write_impl(g, "indices", data.rowval .- 1, extensible=true)
+    write_impl(g, "indptr", data.colptr .- true, extensible=true)
+    write_impl(g, "indices", data.rowval .- true, extensible=true)
     write_impl(g, "data", data.nzval, extensible=true)
 end
 

diff --git a/src/index.jl b/src/index.jl
@@ -122,13 +122,13 @@ function _delete!(idx::Index, oldkeyindex::Integer)
         idx.indices[oldkeyindex:(lastidx - 0x2)] .=
             @view idx.indices[(oldkeyindex + 0x1):(lastidx - 0x1)]
         idx.probepositions[oldkeyindex:(lastidx - 0x2)] .=
-            @view(idx.probepositions[(oldkeyindex + 0x1):(lastidx - 0x1)]) .- 0x1
-        idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = 0x0
+            @view(idx.probepositions[(oldkeyindex + 0x1):(lastidx - 0x1)]) .- true
+        idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = false
     else
         if oldkeyindex < _length(idx)
             idx.indices[oldkeyindex:(end - 0x1)] .= @view idx.indices[(oldkeyindex + 0x1):end]
             idx.probepositions[oldkeyindex:(end - 0x1)] .=
-                @view(idx.probepositions[(oldkeyindex + 0x1):end]) .- 0x1
+                @view(idx.probepositions[(oldkeyindex + 0x1):end]) .- true
         end
         if idx.probepositions[1] > 0x1
             idx.indices[end] = idx.indices[1]
@@ -140,7 +140,7 @@ function _delete!(idx::Index, oldkeyindex::Integer)
             end
             idx.indices[0x1:(lastidx - 0x2)] .= @view idx.indices[0x2:(lastidx - 0x1)]
             idx.probepositions[0x1:(lastidx - 0x2)] .=
-                @view(idx.probepositions[0x2:(lastidx - 0x1)]) .- 0x1
+                @view(idx.probepositions[0x2:(lastidx - 0x1)]) .- true
             idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = 0x0
         else
             idx.indices[end] = idx.probepositions[end] = 0x0

diff --git a/src/sparsedataset.jl b/src/sparsedataset.jl
@@ -91,15 +91,15 @@ function Base.getindex(dset::SparseDataset, I::AbstractUnitRange, J::AbstractUni
     newdata = Vector{eltype(dset)}()
     for (nc, c) in enumerate(J)
         c1, c2 = colptr[c] + 1, colptr[c + 1]
-        currrows = rows[c1:c2] .+ convert(eltype(rows), 1)
+        currrows = rows[c1:c2] .+ true
         rowidx = findall(x -> x ∈ I, currrows)
         newcols[nc + 1] = newcols[nc] + length(rowidx)
 
         if length(rowidx) > 0
             currdata = data[c1:c2][rowidx]
             currrows = currrows[rowidx]
             sort!(rowidx, currdata, currrows)
-            append!(newrows, currrows .- convert(eltype(newrows), first(I)) .+ convert(eltype(newrows), 1))
+            append!(newrows, currrows .- convert(eltype(newrows), first(I)) .+ true)
             append!(newdata, currdata)
         end
     end
@@ -124,7 +124,7 @@ Base.getindex(dset::SparseDataset, ::Colon, J::AbstractUnitRange) = dset[1:size(
 function _getindex(dset, i::Integer, J::AbstractUnitRange)
     colptr = getcolptr(dset)
     c1, c2 = colptr[first(J)] + 1, colptr[last(J) + 1]
-    rows = rowvals(dset)[c1:c2] .+ 1
+    rows = rowvals(dset)[c1:c2] .+ true
     rowidx = findall(x -> x == i, rows)
 
     if length(rowidx) == 0
@@ -152,12 +152,12 @@ end
 function _getindex(dset, I::AbstractUnitRange, j::Integer)
     colptr = getcolptr(dset)
     c1, c2 = colptr[j] + 1, colptr[j + 1]
-    rows = rowvals(dset)[c1:c2] .+ 1
+    rows = rowvals(dset)[c1:c2] .+ true
     rowidx = findall(x -> x ∈ I, rows)
     data = nonzeros(dset)[c1:c2][rowidx]
 
     sort!(rowidx, data)
-    return SparseVector(length(I), rows[rowidx] .- first(I) .+ 1, data)
+    return SparseVector(length(I), rows[rowidx] .- first(I) .+ true, data)
 end
 
 function _getindex(dset, i::Integer, ::Colon)
@@ -186,7 +186,7 @@ function _getindex(dset, ::Colon, j::Integer)
     colptr = getcolptr(dset)
     rows = read(rowvals(dset))
     c1, c2 = colptr[j] + 1, colptr[j + 1]
-    rowidx = rows[c1:c2] .+ 1
+    rowidx = rows[c1:c2] .+ true
     data = nonzeros(dset)[c1:c2]
 
     sort!(rowidx, data)
@@ -233,13 +233,13 @@ function Base.setindex!(
     dsetidx = Int[]
     for (ic, c) in enumerate(J)
         c1, c2 = cols[c] + 1, cols[c + 1]
-        crows = rows[c1:c2] .+ 1
+        crows = rows[c1:c2] .+ true
         rowidx = findall(x -> x ∈ I, crows)
-        xvals = x[I[I .∉ ((@view crows[rowidx]),)] .- first(I) .+ 1, ic]
+        xvals = x[I[I .∉ ((@view crows[rowidx]),)] .- first(I) .+ true, ic]
         if length(rowidx) != length(I) && any(xvals .!= 0)
             throw(KeyError("changing the sparsity structure of a SparseDataset is not supported"))
         end
-        append!(xidx, linxidx[crows[rowidx] .- first(I) .+ 1, ic])
+        append!(xidx, linxidx[crows[rowidx] .- first(I) .+ true, ic])
         append!(dsetidx, c1 - 1 .+ rowidx)
     end
     # HDF5 doesn't support assignment using Arrays as indices, so we have to loop

diff --git a/test/Project.toml b/test/Project.toml
diff --git a/test/elementwise_io.jl b/test/elementwise_io.jl
@@ -1,6 +1,6 @@
 
 
-using CategoricalArrays
+using PooledArrays
 using HDF5
 
 tmp = mktempdir()
@@ -32,8 +32,8 @@ tmp = mktempdir()
         ([true,false,missing,true], "nullable-boolean", "0.1.0"),
         (BitVector([true,false,true]),                  "array",       "0.2.0"),
         (BitMatrix([true false true;false true false]), "array",       "0.2.0"),
-        (CategoricalArray(["a", "b", "a", "a"]), "categorical", "0.2.0"),
-        (CategoricalArray([1, 1, 2, 1]),         "categorical", "0.2.0"),
+        (PooledArray(["a", "b", "a", "a"]), "categorical", "0.2.0"),
+        (PooledArray([1, 1, 2, 1]),         "categorical", "0.2.0"),
     ]
 
     for args in enc_tests
@@ -53,8 +53,8 @@ end
         "d" => 1,
         "e" => true,
         "f" => "a",
-        "g" => CategoricalArray(["a", "b", "a", "a"]),
-        "h" => CategoricalArray([1, 1, 2, 1]),
+        "g" => PooledArray(["a", "b", "a", "a"]; compress=true, signed=true),
+        "h" => PooledArray([1, 1, 2, 1]),
         "i" => [1,2,missing,3],
         "k" => [true,false,missing,true]
     )

diff --git a/test/index.jl b/test/index.jl
@@ -2,14 +2,7 @@ using Random
 Random.seed!(42)
 _size = rand(100:200)
 
-function make_testvalues(_size::Integer)
-    testvalues = Vector{String}(undef, _size)
-    for i in 1:_size
-        length = rand(50:200)
-        testvalues[i] = randstring(length)
-    end
-    return testvalues
-end
+make_testvalues(_size::Integer) = [randstring(rand(50:200)) for _ in 1:_size]
 
 testvalues = make_testvalues(_size)
 idx = Muon.Index(testvalues)