JuliaIO · rafaqz · Jan 27, 2025 · meggart · Feb 10, 2025 · meggart
diff --git a/src/DiskArrays.jl b/src/DiskArrays.jl
@@ -26,6 +26,7 @@ include("generator.jl")
 include("zip.jl")
 include("show.jl")
 include("cached.jl")
+include("pad.jl")
 
 # The all-in-one macro
 

diff --git a/src/cached.jl b/src/cached.jl
@@ -1,7 +1,4 @@
 import Mmap
-# Force disk any abstractarray into a different chunking pattern.
-# This is useful in `zip` and other operations that can iterate
-# over multiple arrays with different patterns.
 
 """
     CachedDiskArray <: AbstractDiskArray

diff --git a/src/pad.jl b/src/pad.jl
@@ -0,0 +1,104 @@
+"""
+    PaddedDiskArray <: AbstractDiskArray
+
+    PaddedDiskArray(A, padding; fill=zero(eltype(A)))
+
+An `AbstractDiskArray` that adds padding to the edges of the parent array.
+This can help changing chunk offsets or padding a larger than memory array
+before windowing operations.
+
+# Arguments
+
+- `A`: The parent disk array.
+- `padding`: A tuple of `Int` lower and upper padding tuples, one for each dimension.
+
+# Keywords
+
+- `fill=zero(eltype(A))`: The value to pad the array with.
+"""
+struct PaddedDiskArray{T,N,A<:AbstractArray{T,N},C,F<:T} <: AbstractDiskArray{T,N}
+    parent::A
+    padding::NTuple{N,Tuple{Int,Int}}
+    fill::F
+    chunks::C
+end
+function PaddedDiskArray(A::AbstractArray{T,N}, padding::NTuple{N,Tuple{Int,Int}};
+    fill=zero(eltype(A)),
+) where {T,N}
+    chunks = GridChunks(map(_pad_offset, eachchunk(A).chunks, padding, size(A)))
+    PaddedDiskArray(A, padding, fill, chunks)
+end
+
+function _pad_offset(c::RegularChunks, (low, high), s)
+    chunksize = c.cs
+    offset = low > 0 ? c.offset - low + chunksize : c.offset
+    size = c.s + low + high
+    return RegularChunks(chunksize, offset, size)
+end
+function _pad_offset(c::IrregularChunks, (low, high), s)
+    offsets = Vector{Int}(undef, length(c.offsets) + 1)
+    # First offset is always zero
+    offsets[begin] = 0
+    # Increase original offsets by lower padding
+    for (i, o) in enumerate(c.offsets)
+        offsets[i + 1] = o + low
+    end
+    # Add offset for start of upper padding
+    offsets[end] = s + low
+    return IrregularChunks(offsets)
+end
+
+Base.parent(A::PaddedDiskArray) = A.parent
+function Base.size(A::PaddedDiskArray)
+    map(size(parent(A)), A.padding) do s, (low, high)
+        s + low + high
+    end
+end
+
+readblock!(A::PaddedDiskArray, data, I...) = _readblock_padded!(A, data, I...)
+readblock!(A::PaddedDiskArray, data, I::AbstractVector...) =
+    _readblock_padded(A, data, I...)
+writeblock!(A::PaddedDiskArray, data, I...) = 
+    throw(ArgumentError("Cannot write to a PaddedDiskArray"))
+haschunks(A::PaddedDiskArray) = haschunks(parent(A))
+eachchunk(A::PaddedDiskArray) = A.chunks
+
+function _readblock_padded(A, data, I...)
+    data .= A.fill
+    Ipadded = map(I, A.padding) do i, (low, high)
+        i .- low
+    end
+    fs = map(axes(parent(A)), Ipadded) do a, ip
+        searchsortedfirst(ip, first(a))
+    end
+    ls = map(axes(parent(A)), Ipadded) do a, ip
+        searchsortedlast(ip, last(a))
+    end
+    return if all(map(<=, fs, ls))
+        Idata = map(:, fs, ls)
+        Iparent = map(getindex, Ipadded, Idata)
+        @show Idata Iparent
+        readblock!(parent(A), view(data, Idata...), Iparent...)
+    else
+        # No overlap, don't read
+        data
+    end
+end
+
+"""
+
+    pad(A, padding; fill=zero(eltype(A)))
+
+Pad any `AbstractArray` with fill values, updating chunk patterns.
+
+# Arguments
+
+- `A`: The parent disk array.
+- `padding`: A tuple of `Int` lower and upper padding tuples, one for each dimension.
+
+# Keywords
+
+- `fill=zero(eltype(A))`: The value to pad the array with.
+"""
+pad(A::AbstractArray{<:Any,N}, padding::NTuple{N,Tuple{Int,Int}}; kw...) where N = 
+    PaddedDiskArray(A, padding; kw...)
diff --git a/src/rechunk.jl b/src/rechunk.jl
@@ -1,7 +1,13 @@
-# Force disk any abstractarray into a different chunking pattern.
-# This is useful in `zip` and other operations that can iterate
-# over multiple arrays with different patterns.
+"""
+    RechunkedDiskArray <: AbstractDiskArray
 
+    RechunkedDiskArray(A::AbstractArray, chunks::GridChunks)
+
+A array that forces any abstractarray into a specific chunking pattern.
+
+This is useful in `zip` and other operations that can iterate
+over multiple arrays with different patterns, that may not allign.
+"""
 struct RechunkedDiskArray{T,N,A<:AbstractArray{T,N},C} <: AbstractDiskArray{T,N}
     parent::A
     chunks::C
@@ -10,16 +16,17 @@ end
 Base.parent(A::RechunkedDiskArray) = A.parent
 Base.size(A::RechunkedDiskArray) = size(parent(A))
 # These could be more efficient with memory in some cases, but this is simple
-readblock!(A::RechunkedDiskArray, data, I...) = _readblock_rechunked(A, data, I...)
-function readblock!(A::RechunkedDiskArray, data, I::AbstractVector...)
-    return _readblock_rechunked(A, data, I...)
-end
-writeblock!(A::RechunkedDiskArray, data, I...) = writeblock!(parent(A), data, I...)
+readblock!(A::RechunkedDiskArray, data, I...) = 
+    _readblock_rechunked!(A, data, I...)
+readblock!(A::RechunkedDiskArray, data, I::AbstractVector...) =
+    _readblock_rechunked!(A, data, I...)
+writeblock!(A::RechunkedDiskArray, data, I...) = 
+    writeblock!(parent(A), data, I...)
 
 haschunks(::RechunkedDiskArray) = Chunked()
 eachchunk(A::RechunkedDiskArray) = A.chunks
 
-function _readblock_rechunked(A, data, I...)
+function _readblock_rechunked!(A, data, I...)
     if haschunks(parent(A)) isa Chunked
         readblock!(parent(A), data, I...)
     else

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -962,6 +962,29 @@ end
     end
 end
 
+@testset "Padded disk arrays" begin
+    M = (1:100) * (1:120)'
+    A = cat(M, 2M, 3M, 4M; dims=3)
+    ch = ChunkedDiskArray(A, (128, 128, 2))
+    pa = DiskArrays.pad(ch, ((10, 20), (30, 40), (1, 2)); fill=999)
+    @test size(pa) == (130, 190, 7)
+    # All outside
+    @test all(==(999), pa[1:10, 1:30, 1:1])
+    # All inside
+    @test pa[11:20, 31:40, 3:4] == ch[1:10, 1:10, 2:3]
+    # Overlapping lower
+    regions = pa[10:20, 30:40, 1:2]
+    testdata = copy(regions) .= 999
+    testdata[2:11, 2:11, 2:2] .= ch[1:10, 1:10, 1:1]
+    @test regions == testdata
+    # Overlapping upper
+    regions = pa[101:120, 141:160, 5:7]
+    testdata = copy(regions) .= 999
+    testdata[1:10, 1:10, 1:1] .= ch[91:100, 111:120, 4:4]
+    testdata
+    @test regions == testdata
+end
+
 @testset "Range subset identification" begin
     inds = [1,2,2,3,5,6,7,10,10]
     readranges, offsets = DiskArrays.find_subranges_sorted(inds,false)