inference failure when broadcasting over multiple structs and a Field #2065
Comments
This is required as a workaround for the type inference failure detailed in CliMA/ClimaCore.jl#2065. The failure appears when a function is broadcast over one field and two structs; the types become too complex and inference fails. This workaround introduces a wrapper struct to contain the two struct inputs, so that the function is broadcast over one field and only one struct, making the types easier to infer.
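For illustration, here is a minimal sketch of that wrapper-struct workaround. The names (ParamsWrapper, foo_wrapped, field, out) are hypothetical and only mirror the MyParams1/MyParams2 structs in the reproducer below; this is not the actual CliMA code.
struct ParamsWrapper{P1, P2}
    p1::P1
    p2::P2
end
# Treat the wrapper as a scalar in broadcasts, the same way the individual structs are treated.
Base.Broadcast.broadcastable(w::ParamsWrapper) = tuple(w)
# The kernel now takes one field element and one struct, instead of one field element and two structs.
foo_wrapped(f, w) = f + w.p1.a - w.p2.b
# Usage: out .= foo_wrapped.(field, ParamsWrapper(p1, p2))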
This is becoming more of an issue as we eliminate the ClimaAtmos solver cache, so I think it's worth either fixing this or opening an issue in JuliaLang.
Here's a reproducer that's much bigger (I'm still working on trimming it down, and I'll keep updating this comment as I do), but it doesn't depend on any CliMA packages.
We can reproduce the issue for
Here's a further trimmed-down reproducer with the VF datalayout. Big win: it does not depend on [...]. We're now below 200 lines:
#=
using Revise; include("broadcast_inference_repro.jl")
julia --project=.buildkite
julia --project=.buildkite broadcast_inference_repro.jl
julia +1.11 --project=.buildkite broadcast_inference_repro.jl
=#
@show VERSION
@static if !(VERSION ≥ v"1.11.0-beta")
using JET;
end
import CUDA # comment to run without CUDA
using Test
import Adapt
import Base
import Base.Broadcast: BroadcastStyle,
Broadcasted, instantiate, broadcasted, materialize, materialize!
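# Minimal stand-in for ClimaCore's VF data layout: a single column with Nv
# vertical levels and element type S, backed by an (Nv, Nf) parent array.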
struct VF{S <: AbstractFloat, Nv, A}
array::A
end
struct VFStyle{Nv, A} <: Base.BroadcastStyle end
function VF{S, Nv}(array::AbstractArray{T, 2}) where {S, Nv, T}
@assert size(array, 1) == Nv
@assert size(array, 2) == typesize(T, S)
VF{S, Nv, typeof(array)}(array)
end
function VF{S}(
::Type{ArrayType};
Nv::Integer,
) where {S, ArrayType}
Nf = typesize(eltype(ArrayType), S)
array = similar(ArrayType, Nv, Nf)
fill!(array, 0)
VF{S, Nv}(array)
end
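# Number of parent-array columns needed to store one value of type S using elements of type T.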
typesize(::Type{T}, ::Type{S}) where {T, S} = div(sizeof(S), sizeof(T))
parent_array_type(::Type{<:Array{T}}) where {T} = Array{T}
Base.eltype(::Type{<:VF{S}}) where {S} = S
Base.parent(data::VF) = getfield(data, :array)
Base.similar(data::VF{S}) where {S} = similar(data, S)
@inline Base.size(data::VF, i::Integer) = size(data)[i]
@inline Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, 1, Nv, 1)
Base.length(data::VF{S, Nv}) where {S, Nv} = Nv
Base.lastindex(data::VF) = length(data)
Base.copy(data::VF{S, NV}) where {S, NV} = VF{S, NV}(copy(parent(data)))
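# Broadcast machinery: VF broadcasts with its own style, parameterized by the
# number of vertical levels and the parent array type.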
Base.Broadcast.BroadcastStyle(::Type{VF{S, Nv, A}}) where {S, Nv, A} = VFStyle{Nv, parent_array_type(A)}()
Base.Broadcast.BroadcastStyle(::Base.Broadcast.Style{<:Tuple}, ds::VFStyle) = ds
Base.Broadcast.broadcastable(data::VF) = data
Adapt.adapt_structure(to, data::VF{S, NV}) where {S, NV} = VF{S, NV}(Adapt.adapt(to, parent(data)))
@inline parent_array_type(::Type{VF{S, Nv, A}}) where {S, Nv, A} = A
Base.ndims(data::VF) = Base.ndims(typeof(data))
Base.ndims(::Type{T}) where {T <: VF} = Base.ndims(parent_array_type(T))
function Base.similar(
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
::Type{S},
) where {Nv, A, S}
PA = parent_array_type(A)
array = similar(PA, (Nv, typesize(eltype(A), S)))
return VF{S, Nv}(array)
end
@inline function Base.getindex(
data::VF{S, Nv},
I::CartesianIndex,
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
return parent(data)[I.I[4], 1]
end
@inline function Base.setindex!(
data::VF{S, Nv},
val,
I::CartesianIndex,
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
parent(data)[I.I[4], 1] = val
end
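# copyto! dispatches on the parent array type, so the CPU and CUDA paths below differ.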
function Base.copyto!(
dest::VF{S},
bc::Union{VF, Base.Broadcast.Broadcasted},
) where {S}
Base.copyto!(dest, bc, parent(dest))
dest
end
function Base.copyto!(
dest::VF{S, Nv},
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
::Array,
) where {S, Nv, A}
@inbounds for v in 1:Nv
idx = CartesianIndex(1, 1, 1, v, 1)
dest[idx] = convert(S, bc[idx])
end
return dest
end
# CUDA extension: parent array type, similar, and a kernel-based copyto! for CuArray-backed VF
@static if @isdefined(CUDA)
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} = CUDA.CuArray{T, N, B} where {N}
Base.similar(
::Type{CUDA.CuArray{T, N′, B} where {N′}},
dims::Dims{N},
) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)
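# Each CUDA thread copies one vertical level of the broadcasted expression into dest.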
function knl_copyto!(dest::VF{S, Nv}, src) where {S, Nv}
(tv,) = CUDA.threadIdx()
(bv,) = CUDA.blockIdx()
v = tv + (bv - 1) * CUDA.blockDim().x
I = CartesianIndex((1, 1, 1, v, 1))
if 1 ≤ I.I[4] ≤ Nv
@inbounds dest[I] = src[I]
end
return nothing
end
function Base.copyto!(dest::VF{S, Nv}, bc, to::CUDA.CuArray) where {S, Nv}
kernel = CUDA.@cuda always_inline = true launch = false knl_copyto!(dest, bc)
config = CUDA.launch_configuration(kernel.fun)
n_max_threads = min(config.threads, Nv)
Nvt = fld(n_max_threads, Nv)
Nv_thread = Nvt == 0 ? n_max_threads : min(Int(Nvt), Nv)
Nv_blocks = cld(Nv, Nv_thread)
@assert Nv_thread ≤ n_max_threads "threads,n_max_threads=($(Nv_thread),$n_max_threads)"
p = (; threads = (Nv_thread,), blocks = (Nv_blocks,))
kernel(dest, bc; threads = p.threads, blocks = p.blocks)
return dest
end
end
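# Two parameter structs whose broadcastable is extended using tuple, mirroring
# the pattern that triggers the inference failure.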
struct MyParams1{A}
a::A
end;
struct MyParams2{B}
b::B
end;
Base.Broadcast.broadcastable(x::MyParams1) = tuple(x);
Base.Broadcast.broadcastable(x::MyParams2) = tuple(x);
foo(f, p1, p2) = f + p1.a - p2.b;
bar(p1, p2, f) = f + p1.a - p2.b;
FT = Float64;
p1 = MyParams1{FT}(1);
p2 = MyParams2{FT}(2);
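# Four cases: a plain CPU array, VF over a CPU array, a plain CuArray, and VF
# over a CuArray; only the last combination fails to compile.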
@testset "Broken test" begin
b = zeros(FT, 5,5); # Ordinary CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
end
b = VF{FT}(Array{FT}; Nv=4); # VF with CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
end
@static if @isdefined(CUDA)
b = CUDA.zeros(FT, 5,5); # CUDA.CuArray works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
b = VF{FT}(CUDA.CuArray{FT}; Nv=4); # VF with CUDA.CuArray fails
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
@test_throws CUDA.InvalidIRError materialize!(a, bc) # fails to compile
end
end
nothing
I opened JuliaGPU/CUDA.jl#2623.
Describe the bug
Reproducer added in #2062
Initially found in CliMA/ClimaLand.jl#739
This is a type inference error that arises when a function is broadcast over one field and two structs that both have `broadcastable` extended using `tuple`. The types in the expression become too complex, and inference fails. The problem is simplified in the reproducer so that each struct contains one scalar value and the called function itself does almost nothing. The reproducer was tested on fields constructed on multiple different spaces, and it reproduced the bug on all of them.
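Schematically, the failing and working broadcast shapes look like the following. This is ClimaCore-style pseudocode with illustrative names (out, field, params1, params2, WrappedParams), not the actual ClimaLand call site.
# Fails to infer: one Field and two structs that are each broadcastable as a one-element tuple.
# @. out = f(field, params1, params2)
# Workaround: wrap both structs so only one non-Field argument appears in the broadcast.
# @. out = f(field, WrappedParams(params1, params2))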