Skip to content

Commit

Permalink
Merge pull request #28 from dmbates/PooledArrays
Browse files Browse the repository at this point in the history
Use PooledArrays for unordered categorical vectors
  • Loading branch information
ilia-kats authored Mar 20, 2024
2 parents 53d51b3 + 9785582 commit 22cc850
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 56 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Files generated by invoking Julia with --code-coverage
*.jl.cov
*.jl.*.cov
lcov.info

# Files generated by invoking Julia with --track-allocation
*.jl.mem
Expand All @@ -22,3 +23,7 @@ docs/site/
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml
Manifest-v*.toml

# Directory used by VScode for local settings
.vscode/
12 changes: 10 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,24 @@ HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"

[compat]
CategoricalArrays = "0.9"
CategoricalArrays = "0.9,0.10"
CompressHashDisplace = "0.1.2"
OrderedCollections = "1.6"
DataFrames = "0.22, 1"
FileIO = "1.6"
HDF5 = "0.16 - 0.99, 1"
OrderedCollections = "1.6"
PooledArrays = "1"
StructArrays = "0.6.4"
julia = "1.5"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
1 change: 1 addition & 0 deletions src/Muon.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using HDF5
using DataFrames
using CategoricalArrays
using StructArrays
using PooledArrays
import CompressHashDisplace: FrozenDict
import OrderedCollections: OrderedDict
using FileIO
Expand Down
58 changes: 37 additions & 21 deletions src/hdf5_io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,22 @@ function read_matrix(f::HDF5.Dataset; kwargs...)
haskey(attributes(categories), "ordered") &&
read_attribute(categories, "ordered") == true
cats = read(categories)
mat = mat .+ 0x1
mat = compress(
CategoricalArray{eltype(cats), ndims(mat)}(
mat,
CategoricalPool{eltype(cats), eltype(mat)}(cats, ordered),
),
)
mat .+= 1
mat = if ordered
compress(
CategoricalArray{eltype(cats), ndims(mat)}(
mat,
CategoricalPool{eltype(cats), eltype(mat)}(cats, ordered),
),
)
else
PooledArray(
PooledArrays.RefArray(mat),
Dict{eltype(cats), eltype(mat)}(
v => i for (i, v) in enumerate(cats)
)
)
end
end
return mat
end
Expand All @@ -66,8 +75,8 @@ function read_matrix(f::HDF5.Group; kwargs...)
indices = read(f, "indices")
data = read(f, "data")

indptr .+= eltype(indptr)(1)
indices .+= eltype(indptr)(1)
indptr .+= 1
indices .+= 1

# the row indices in every column need to be sorted
@views for (colstart, colend) in zip(indptr[1:(end - 1)], indptr[2:end])
Expand All @@ -82,13 +91,20 @@ function read_matrix(f::HDF5.Group; kwargs...)
elseif enctype == "categorical"
ordered = read_attribute(f, "ordered") > 0
categories = read(f, "categories")
codes = read(f, "codes") .+ 1

T = any(codes .== 0) ? Union{Missing, eltype(categories)} : eltype(categories)
mat = CategoricalVector{T}(
undef, length(codes); levels=categories, ordered=ordered)
copy!(mat.refs, codes)

codes = read(f, "codes") .+ true

T = any(iszero, codes) ? Union{Missing, eltype(categories)} : eltype(categories)
mat = if ordered
CategoricalVector{T}(
undef, length(codes); levels=categories, ordered=ordered
)
copy!(mat.refs, codes)
else
PooledArray(
PooledArrays.RefArray(codes),
Dict{T, eltype(codes)}(v => i for (i, v) in enumerate(categories)),
)
end
return mat
else
error("unknown encoding $enctype")
Expand Down Expand Up @@ -209,16 +225,16 @@ end
function write_impl(
parent::Union{HDF5.File, HDF5.Group},
name::AbstractString,
data::CategoricalArray;
data::Union{CategoricalArray,PooledArray};
kwargs...,
)
g = create_group(parent, name)
attrs = attributes(g)
attrs["encoding-type"] = "categorical"
attrs["encoding-version"] = "0.2.0"
_write_attribute(g, "ordered", isordered(data))
_write_attribute(g, "ordered", isa(data, CategoricalArray) && isordered(data))
write_impl(g, "categories", levels(data); kwargs...)
write_impl(g, "codes", data.refs .- 1; kwargs...)
write_impl(g, "codes", data.refs .- true; kwargs...)
end

function write_impl(
Expand Down Expand Up @@ -388,8 +404,8 @@ function write_impl(
shape = collect(size(data))
transposed && reverse!(shape)
attrs["shape"] = shape
write_impl(g, "indptr", data.colptr .- 1, extensible=true)
write_impl(g, "indices", data.rowval .- 1, extensible=true)
write_impl(g, "indptr", data.colptr .- true, extensible=true)
write_impl(g, "indices", data.rowval .- true, extensible=true)
write_impl(g, "data", data.nzval, extensible=true)
end

Expand Down
8 changes: 4 additions & 4 deletions src/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ function _delete!(idx::Index, oldkeyindex::Integer)
idx.indices[oldkeyindex:(lastidx - 0x2)] .=
@view idx.indices[(oldkeyindex + 0x1):(lastidx - 0x1)]
idx.probepositions[oldkeyindex:(lastidx - 0x2)] .=
@view(idx.probepositions[(oldkeyindex + 0x1):(lastidx - 0x1)]) .- 0x1
idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = 0x0
@view(idx.probepositions[(oldkeyindex + 0x1):(lastidx - 0x1)]) .- true
idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = false
else
if oldkeyindex < _length(idx)
idx.indices[oldkeyindex:(end - 0x1)] .= @view idx.indices[(oldkeyindex + 0x1):end]
idx.probepositions[oldkeyindex:(end - 0x1)] .=
@view(idx.probepositions[(oldkeyindex + 0x1):end]) .- 0x1
@view(idx.probepositions[(oldkeyindex + 0x1):end]) .- true
end
if idx.probepositions[1] > 0x1
idx.indices[end] = idx.indices[1]
Expand All @@ -140,7 +140,7 @@ function _delete!(idx::Index, oldkeyindex::Integer)
end
idx.indices[0x1:(lastidx - 0x2)] .= @view idx.indices[0x2:(lastidx - 0x1)]
idx.probepositions[0x1:(lastidx - 0x2)] .=
@view(idx.probepositions[0x2:(lastidx - 0x1)]) .- 0x1
@view(idx.probepositions[0x2:(lastidx - 0x1)]) .- true
idx.indices[lastidx - 0x1] = idx.probepositions[lastidx - 0x1] = 0x0
else
idx.indices[end] = idx.probepositions[end] = 0x0
Expand Down
18 changes: 9 additions & 9 deletions src/sparsedataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,15 @@ function Base.getindex(dset::SparseDataset, I::AbstractUnitRange, J::AbstractUni
newdata = Vector{eltype(dset)}()
for (nc, c) in enumerate(J)
c1, c2 = colptr[c] + 1, colptr[c + 1]
currrows = rows[c1:c2] .+ convert(eltype(rows), 1)
currrows = rows[c1:c2] .+ true
rowidx = findall(x -> x ∈ I, currrows)
newcols[nc + 1] = newcols[nc] + length(rowidx)

if length(rowidx) > 0
currdata = data[c1:c2][rowidx]
currrows = currrows[rowidx]
sort!(rowidx, currdata, currrows)
append!(newrows, currrows .- convert(eltype(newrows), first(I)) .+ convert(eltype(newrows), 1))
append!(newrows, currrows .- convert(eltype(newrows), first(I)) .+ true)
append!(newdata, currdata)
end
end
Expand All @@ -124,7 +124,7 @@ Base.getindex(dset::SparseDataset, ::Colon, J::AbstractUnitRange) = dset[1:size(
function _getindex(dset, i::Integer, J::AbstractUnitRange)
colptr = getcolptr(dset)
c1, c2 = colptr[first(J)] + 1, colptr[last(J) + 1]
rows = rowvals(dset)[c1:c2] .+ 1
rows = rowvals(dset)[c1:c2] .+ true
rowidx = findall(x -> x == i, rows)

if length(rowidx) == 0
Expand Down Expand Up @@ -152,12 +152,12 @@ end
function _getindex(dset, I::AbstractUnitRange, j::Integer)
colptr = getcolptr(dset)
c1, c2 = colptr[j] + 1, colptr[j + 1]
rows = rowvals(dset)[c1:c2] .+ 1
rows = rowvals(dset)[c1:c2] .+ true
rowidx = findall(x -> x ∈ I, rows)
data = nonzeros(dset)[c1:c2][rowidx]

sort!(rowidx, data)
return SparseVector(length(I), rows[rowidx] .- first(I) .+ 1, data)
return SparseVector(length(I), rows[rowidx] .- first(I) .+ true, data)
end

function _getindex(dset, i::Integer, ::Colon)
Expand Down Expand Up @@ -186,7 +186,7 @@ function _getindex(dset, ::Colon, j::Integer)
colptr = getcolptr(dset)
rows = read(rowvals(dset))
c1, c2 = colptr[j] + 1, colptr[j + 1]
rowidx = rows[c1:c2] .+ 1
rowidx = rows[c1:c2] .+ true
data = nonzeros(dset)[c1:c2]

sort!(rowidx, data)
Expand Down Expand Up @@ -233,13 +233,13 @@ function Base.setindex!(
dsetidx = Int[]
for (ic, c) in enumerate(J)
c1, c2 = cols[c] + 1, cols[c + 1]
crows = rows[c1:c2] .+ 1
crows = rows[c1:c2] .+ true
rowidx = findall(x -> x ∈ I, crows)
xvals = x[I[I .∉ ((@view crows[rowidx]),)] .- first(I) .+ 1, ic]
xvals = x[I[I .∉ ((@view crows[rowidx]),)] .- first(I) .+ true, ic]
if length(rowidx) != length(I) && any(xvals .!= 0)
throw(KeyError("changing the sparsity structure of a SparseDataset is not supported"))
end
append!(xidx, linxidx[crows[rowidx] .- first(I) .+ 1, ic])
append!(xidx, linxidx[crows[rowidx] .- first(I) .+ true, ic])
append!(dsetidx, c1 - 1 .+ rowidx)
end
# HDF5 doesn't support assignment using Arrays as indices, so we have to loop
Expand Down
7 changes: 0 additions & 7 deletions test/Project.toml

This file was deleted.

10 changes: 5 additions & 5 deletions test/elementwise_io.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@


using CategoricalArrays
using PooledArrays
using HDF5

tmp = mktempdir()
Expand Down Expand Up @@ -32,8 +32,8 @@ tmp = mktempdir()
([true,false,missing,true], "nullable-boolean", "0.1.0"),
(BitVector([true,false,true]), "array", "0.2.0"),
(BitMatrix([true false true;false true false]), "array", "0.2.0"),
(CategoricalArray(["a", "b", "a", "a"]), "categorical", "0.2.0"),
(CategoricalArray([1, 1, 2, 1]), "categorical", "0.2.0"),
(PooledArray(["a", "b", "a", "a"]), "categorical", "0.2.0"),
(PooledArray([1, 1, 2, 1]), "categorical", "0.2.0"),
]

for args in enc_tests
Expand All @@ -53,8 +53,8 @@ end
"d" => 1,
"e" => true,
"f" => "a",
"g" => CategoricalArray(["a", "b", "a", "a"]),
"h" => CategoricalArray([1, 1, 2, 1]),
"g" => PooledArray(["a", "b", "a", "a"]; compress=true, signed=true),
"h" => PooledArray([1, 1, 2, 1]),
"i" => [1,2,missing,3],
"k" => [true,false,missing,true]
)
Expand Down
9 changes: 1 addition & 8 deletions test/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,7 @@ using Random
Random.seed!(42)
_size = rand(100:200)

function make_testvalues(_size::Integer)
testvalues = Vector{String}(undef, _size)
for i in 1:_size
length = rand(50:200)
testvalues[i] = randstring(length)
end
return testvalues
end
make_testvalues(_size::Integer) = [randstring(rand(50:200)) for _ in 1:_size]

testvalues = make_testvalues(_size)
idx = Muon.Index(testvalues)
Expand Down

0 comments on commit 22cc850

Please sign in to comment.