Skip to content

POC: Implementation of ZEP0003 #126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions src/ZArray.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import JSON
import OffsetArrays: OffsetArray
import DiskArrays: AbstractDiskArray
import DiskArrays: AbstractDiskArray, findchunk, max_chunksize
import DiskArrays

"""
Expand Down Expand Up @@ -122,8 +122,7 @@ trans_ind(r, bs)
For a given index and block size, determine which chunks of the ZArray have to
be accessed.
"""
trans_ind(r::AbstractUnitRange, bs) = fld1(first(r),bs):fld1(last(r),bs)
trans_ind(r::Integer, bs) = fld1(r,bs)
trans_ind(r, bs) = findchunk(bs,r)

function boundint(r1, s2, o2)
r2 = range(o2+1,length=s2)
Expand All @@ -134,14 +133,14 @@ end

function getchunkarray(z::ZArray{>:Missing})
# temporary workaround to use strings as data values
inner = fill(z.metadata.fill_value, z.metadata.chunks)
inner = fill(z.metadata.fill_value, max_chunksize.(z.metadata.chunks))
a = SenMissArray(inner,z.metadata.fill_value)
end
# Generic fallback: the zero element of `T`, obtained from `Base.zero`.
function _zero(T)
    return zero(T)
end
# String-like element types have no numeric zero; an empty string is used instead.
function _zero(T::Type{<:MaxLengthString})
    return T("")
end

# For `ASCIIChar` the value constructed from 0 is used.
function _zero(T::Type{ASCIIChar})
    return ASCIIChar(0)
end
# For vector-valued element types the "zero" is an empty vector with element type `T`.
function _zero(::Type{<:Vector{T}}) where {T}
    return T[]
end
getchunkarray(z::ZArray) = fill(_zero(eltype(z)), z.metadata.chunks)
getchunkarray(z::ZArray) = fill(_zero(eltype(z)), max_chunksize.(z.metadata.chunks))

# A plain `Array` has no wrapper to strip: hand back the very same object.
function maybeinner(a::Array)
    return a
end
# For a `SenMissArray` return the array stored in its `x` field.
function maybeinner(a::SenMissArray)
    return a.x
end
Expand Down Expand Up @@ -247,7 +246,7 @@ end
# DiskArrays interface for `ZArray`.
#
# Block reads are forwarded to the unqualified `readblock!` (argument order:
# output buffer, array, indices), with the requested ranges bundled into one
# `CartesianIndices` object.
function DiskArrays.readblock!(a::ZArray, aout, i::AbstractUnitRange...)
    return readblock!(aout, a, CartesianIndices(i))
end

# Block writes are forwarded the same way to the unqualified `writeblock!`
# (argument order: values, array, indices).
function DiskArrays.writeblock!(a::ZArray, v, i::AbstractUnitRange...)
    return writeblock!(v, a, CartesianIndices(i))
end

# Every `ZArray` reports itself as chunked.
function DiskArrays.haschunks(::ZArray)
    return DiskArrays.Chunked()
end
DiskArrays.eachchunk(a::ZArray) = DiskArrays.GridChunks(a,a.metadata.chunks)
DiskArrays.eachchunk(a::ZArray) = DiskArrays.GridChunks(a.metadata.chunks...)

"""
uncompress_raw!(a::DenseArray{T},z::ZArray{T,N},i::CartesianIndex{N})
Expand Down Expand Up @@ -278,7 +277,7 @@ function uncompress_to_output!(aout,output_base_offsets,z,chunk_compressed,curre
end

function compress_raw(a,z)
length(a) == prod(z.metadata.chunks) || throw(DimensionMismatch("Array size does not equal chunk size"))
#length(a) == prod(z.metadata.chunks) || throw(DimensionMismatch("Array size does not equal chunk size"))
if !all(isequal(z.metadata.fill_value),a)
dtemp = UInt8[]
zcompress!(dtemp,a,z.metadata.compressor, z.metadata.filters)
Expand Down Expand Up @@ -383,7 +382,7 @@ chunkindices(z::ZArray)

Returns the Cartesian Indices of the chunks of a given ZArray
"""
chunkindices(z::ZArray) = CartesianIndices(map((s, c) -> 1:ceil(Int, s/c), z.metadata.shape[], z.metadata.chunks))
chunkindices(z::ZArray) = CartesianIndices(length.(z.metadata.chunks))

"""
zzeros(T, dims...; kwargs... )
Expand All @@ -392,7 +391,7 @@ Creates a zarr array and initializes all values with zero. Accepts the same keyw
"""
function zzeros(T,dims...;kwargs...)
z = zcreate(T,dims...;kwargs...)
as = zeros(T, z.metadata.chunks...)
as = zeros(T, max_chunksize.(z.metadata.chunks)...)
data_encoded = compress_raw(as,z)
p = z.path
for i in chunkindices(z)
Expand Down Expand Up @@ -459,9 +458,9 @@ end

function prune_oob_chunks(s::AbstractStore,path,oldsize, newsize, chunks)
dimstoshorten = findall(map(<,newsize, oldsize))
allchunkranges = Base.OneTo.(length.(chunks))
for idim in dimstoshorten
delrange = (fld1(newsize[idim],chunks[idim])+1):(fld1(oldsize[idim],chunks[idim]))
allchunkranges = map(i->1:fld1(oldsize[i],chunks[i]),1:length(oldsize))
delrange = (findchunk(chunks[idim],newsize[idim])+1):findchunk(chunks[idim],oldsize[idim])
r = (allchunkranges[1:idim-1]..., delrange, allchunkranges[idim+1:end]...)
for cI in CartesianIndices(r)
delete!(s,path,cI)
Expand Down
32 changes: 25 additions & 7 deletions src/metadata.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import Dates: Date, DateTime

using DiskArrays: RegularChunks, IrregularChunks
const SomeChunk = Union{RegularChunks, IrregularChunks}
"""NumPy array protocol type string (typestr) format

A string providing the basic type of the homogeneous array. The basic string format
Expand Down Expand Up @@ -114,6 +115,9 @@ function typestr(s::AbstractString, filterlist=nothing)
end
end

# Size value derived from a chunk description along one dimension:
# the `s` field for `RegularChunks`, the last entry of `offsets` for
# `IrregularChunks`.
function arraysize_from_chunksize(cs::RegularChunks)
    return cs.s
end

function arraysize_from_chunksize(cs::IrregularChunks)
    return last(cs.offsets)
end

"""Metadata configuration of the stored array

Each array requires essential configuration metadata to be stored, enabling correct
Expand All @@ -125,7 +129,7 @@ https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata
struct Metadata{T, N, C, F}
zarr_format::Int
shape::Base.RefValue{NTuple{N, Int}}
chunks::NTuple{N, Int}
chunks::NTuple{N, SomeChunk}
dtype::String # structured data types not yet supported
compressor::C
fill_value::Union{T, Nothing}
Expand All @@ -136,9 +140,17 @@ struct Metadata{T, N, C, F}
zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol"))
#Do some sanity checks to make sure we have a sane array
any(<(0), shape) && throw(ArgumentError("Size must be positive"))
any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension"))
chunks = map(shape,chunks) do s,c
if isa(c,Int)
c=RegularChunks(c,0,s)
elseif isa(c,AbstractVector{<:Integer})
c=IrregularChunks(chunksizes=c)
end
arraysize_from_chunksize(c) < s && throw(ArgumentError("Size of chunks must be larger or equal the size of the array"))
c
end
order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported"))
new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters)
new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), (chunks...,), dtype, compressor,fill_value, order, filters)
end
end

Expand All @@ -157,7 +169,7 @@ end


"Construct Metadata based on your data"
function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int};
function Metadata(A::AbstractArray{T, N}, chunks::Tuple;
zarr_format::Integer=2,
compressor::C=BloscCompressor(),
fill_value::Union{T, Nothing}=nothing,
Expand Down Expand Up @@ -196,12 +208,15 @@ function Metadata(d::AbstractDict, fill_as_missing)

fv = fill_value_decoding(d["fill_value"], T)

chunks = map(d["chunks"],d["shape"]) do c,s
isa(c,Integer) ? RegularChunks(c,0,s) : IrregularChunks(chunksizes=c)
end
TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing}

Metadata{TU, N, C, F}(
d["zarr_format"],
NTuple{N, Int}(d["shape"]) |> reverse,
NTuple{N, Int}(d["chunks"]) |> reverse,
chunks |> reverse,
d["dtype"],
compressor,
fv,
Expand All @@ -210,12 +225,15 @@ function Metadata(d::AbstractDict, fill_as_missing)
)
end

# Value written for one dimension's chunking: the `cs` field for
# `RegularChunks`, or the length of each element of an `IrregularChunks`.
function chunk_encoding(c::RegularChunks)
    return c.cs
end

function chunk_encoding(c::IrregularChunks)
    return length.(c)
end

"Describes how to lower Metadata to JSON, used in json(::Metadata)"
function JSON.lower(md::Metadata)
Dict{String, Any}(
"zarr_format" => md.zarr_format,
"shape" => md.shape[] |> reverse,
"chunks" => md.chunks |> reverse,
"chunks" => chunk_encoding.(md.chunks) |> reverse,
"dtype" => md.dtype,
"compressor" => md.compressor,
"fill_value" => fill_value_encoding(md.fill_value),
Expand Down
5 changes: 3 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ using JSON
using Pkg
using PyCall
using Dates
using Zarr: RegularChunks, IrregularChunks

macro test_py(ex)
quote
Expand All @@ -25,7 +26,7 @@ end
@test eltype(z.storage.a["0.0"]) === UInt8
@test z.metadata.shape[] === (2, 3)
@test z.metadata.order === 'C'
@test z.metadata.chunks === (2, 3)
@test z.metadata.chunks === (RegularChunks(2,0,2), RegularChunks(3,0,3))
@test z.metadata.fill_value === nothing
@test z.metadata.compressor isa Zarr.BloscCompressor
@test z.metadata.compressor.blocksize === 0
Expand Down Expand Up @@ -122,7 +123,7 @@ end
@test metadata isa Zarr.Metadata
@test metadata.zarr_format === 2
@test metadata.shape[] === size(A)
@test metadata.chunks === chunks
@test metadata.chunks === (RegularChunks(5,0,30),RegularChunks(10,0,20))
@test metadata.dtype === "<f8"
@test metadata.compressor === Zarr.BloscCompressor(0, 5, "lz4", true)
@test metadata.fill_value === -1.5
Expand Down