Skip to content

Commit

Permalink
update AMDGPU
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Barth committed Nov 27, 2024
1 parent 7bb1c99 commit a8b8ef1
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 4 deletions.
73 changes: 69 additions & 4 deletions ext/AMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,77 @@ using AMDGPU
using Flux


function interpnd!(pos::AbstractVector{<:NTuple{N}},A::ROCArray,vec) where N
@roc interpnd!(pos,A,vec)
# Device kernel: N-dimensional multilinear interpolation.
#
# For every fractional position `pos[i]` (given in array index space),
# accumulate into `vec[i]` the multilinear interpolation of `A` at that
# position. Work items cover `1:length(pos)` with a grid-stride loop, so
# any launch configuration processes all positions.
function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
    index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    stride = gridGroupDim().x * workgroupDim().x

    @inbounds for i = index:stride:length(pos)
        p = pos[i]
        # lower corner of the grid cell containing p
        ind = floor.(Int,p)

        # interpolation coefficients
        c = p .- ind

        # visit the 2^N corners of the cell
        for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
            p2 = Tuple(offset) .+ ind

            # product of the per-dimension linear weights for this corner
            cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))
            # NOTE(review): accumulates with `+=` — callers apparently must
            # zero-initialize `vec` before launch; confirm at call sites.
            vec[i] += A[p2...] * cc
        end
    end

    return nothing
end

function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values::ROCArray,A2) where N
@roc interp_adjn!(pos,values,A2)
# Host-side launcher for the `interpnd_d!` device kernel on AMD GPUs.
#
# The kernel is compiled lazily (`launch=false`), the occupancy-based
# launch configuration is queried, and enough workgroups are dispatched
# to cover every entry of `pos`. The call blocks until the GPU finishes.
function interpnd!(pos::AbstractVector{<:NTuple{N}}, d_A::ROCArray, vec_d) where N
    AMDGPU.@sync begin
        npos = length(pos)
        compiled = @roc launch=false interpnd_d!(pos, d_A, vec_d)
        cfg = launch_configuration(compiled.fun)
        wgsize = min(npos, cfg.groupsize)
        ngroups = cld(npos, wgsize)
        @debug ngroups, wgsize

        compiled(pos, d_A, vec_d; groupsize = wgsize, gridsize = ngroups)
    end
end


# Device kernel: adjoint (scatter) of `interpnd_d!`.
#
# For every position `pos[i]`, the weighted value `values[i]` is spread
# onto the 2^N grid points surrounding the position. Different positions
# can hit the same grid point, hence the atomic accumulation.
#
# `A2` MUST be zero-initialized by the caller before the launch (done in
# `interp_adjn!` below). Zeroing inside the kernel — as the previous
# version did — races against concurrent `atomic_add!` from other
# wavefronts/workgroups and forced a single-workgroup launch.
function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
    index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    stride = gridGroupDim().x * workgroupDim().x

    @inbounds for i = index:stride:length(pos)
        p = pos[i]
        # lower corner of the grid cell containing p
        ind = floor.(Int,p)

        # interpolation coefficients
        c = p .- ind

        for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
            p2 = Tuple(offset) .+ ind

            cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))

            # atomic: several positions may scatter onto the same point
            I = LinearIndices(A2)[p2...]
            AMDGPU.atomic_add!(pointer(A2,I), values[i] * cc)
        end
    end

    return nothing
end


# Host-side launcher for `interp_adjn_d!` on AMD GPUs.
#
# The output array is zeroed on the host queue *before* the kernel runs,
# which makes a multi-workgroup, occupancy-sized launch safe (the kernel
# itself only performs atomic adds). This mirrors the launch logic of
# `interpnd!` and removes the previous `gridsize = 1` restriction.
function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,d_A2) where N
    AMDGPU.@sync begin
        # initialize before the kernel launch (queued on the same stream,
        # so it completes before the scatter kernel starts)
        d_A2 .= 0

        len = length(pos)
        kernel = @roc launch=false interp_adjn_d!(pos,values_d,d_A2)
        config = launch_configuration(kernel.fun)
        groupsize = min(len, config.groupsize)
        gridsize = cld(len, groupsize)
        @debug gridsize,groupsize

        kernel(pos,values_d,d_A2; groupsize, gridsize)
    end
end

@inline function _to_device(::Type{Atype}) where Atype <: ROCArray
Expand Down
1 change: 1 addition & 0 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x

# initialize before kernel launch???
A2 .= 0

@inbounds for i = index:stride:length(pos)
Expand Down
3 changes: 3 additions & 0 deletions test/test_DINCAE_SST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ using DINCAE
using Base.Iterators
using Random
using NCDatasets
using AMDGPU
using CUDA

# Element type used throughout the tests.
const F = Float32

# Pick the array backend for the tests: CUDA first, then AMDGPU,
# otherwise fall back to plain CPU arrays.
Atype = CUDA.functional()   ? CuArray{F}  :
        AMDGPU.functional() ? ROCArray{F} :
                              Array{F}
Expand Down

0 comments on commit a8b8ef1

Please sign in to comment.