diff --git a/ext/AMDGPUExt.jl b/ext/AMDGPUExt.jl
index f477d20..05c4c6d 100644
--- a/ext/AMDGPUExt.jl
+++ b/ext/AMDGPUExt.jl
@@ -5,12 +5,77 @@
 using AMDGPU
 using Flux
 
-function interpnd!(pos::AbstractVector{<:NTuple{N}},A::ROCArray,vec) where N
-    @roc interpnd!(pos,A,vec)
+function interpnd_d!(pos::AbstractVector{<:NTuple{N}},A,vec) where N
+    index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    stride = gridGroupDim().x * workgroupDim().x
+
+    @inbounds for i = index:stride:length(pos)
+        p = pos[i]
+        ind = floor.(Int,p)
+
+        # interpolation coefficients
+        c = p .- ind
+
+        for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
+            p2 = Tuple(offset) .+ ind
+
+            cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))
+            vec[i] += A[p2...] * cc
+        end
+    end
+
+    return nothing
 end
 
-function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values::ROCArray,A2) where N
-    @roc interp_adjn!(pos,values,A2)
+function interpnd!(pos::AbstractVector{<:NTuple{N}},d_A::ROCArray,vec_d) where N
+    AMDGPU.@sync begin
+        len = length(pos)
+        kernel = @roc launch=false interpnd_d!(pos,d_A,vec_d)
+        config = launch_configuration(kernel.fun)
+        groupsize = min(len, config.groupsize)
+        gridsize = cld(len, groupsize)
+        @debug gridsize,groupsize
+
+        kernel(pos,d_A,vec_d; groupsize, gridsize)
+    end
+end
+
+
+function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
+    index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    stride = gridGroupDim().x * workgroupDim().x
+
+    A2 .= 0
+
+    @inbounds for i = index:stride:length(pos)
+        p = pos[i]
+        ind = floor.(Int,p)
+
+        # interpolation coefficients
+        c = p .- ind
+
+        for offset in CartesianIndices(ntuple(n -> 0:1,Val(N)))
+            p2 = Tuple(offset) .+ ind
+
+            cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1-c[n]),Val(N)))
+
+            I = LinearIndices(A2)[p2...]
+            AMDGPU.atomic_add!(pointer(A2,I), values[i] * cc)
+        end
+    end
+
+    return nothing
+end
+
+
+function interp_adjn!(pos::AbstractVector{<:NTuple{N}},values_d::ROCArray,d_A2) where N
+    AMDGPU.@sync begin
+        len = length(pos)
+        #numgridsize = ceil(Int, length(pos)/256)
+        # must be 1: zeroing A2 inside the kernel races across workgroups
+        numgridsize = 1
+        @roc groupsize=256 gridsize=numgridsize interp_adjn_d!(pos,values_d,d_A2)
+    end
 end
 
 @inline function _to_device(::Type{Atype}) where Atype <: ROCArray
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 1ff87fb..d4a046c 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -45,6 +45,7 @@ function interp_adjn_d!(pos::AbstractVector{<:NTuple{N}},values,A2) where N
     index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     stride = gridDim().x * blockDim().x
 
+    # TODO: initialize A2 before the kernel launch? zeroing it here races with the atomic updates
     A2 .= 0
 
     @inbounds for i = index:stride:length(pos)
diff --git a/test/test_DINCAE_SST.jl b/test/test_DINCAE_SST.jl
index 9758552..55db57b 100644
--- a/test/test_DINCAE_SST.jl
+++ b/test/test_DINCAE_SST.jl
@@ -6,12 +6,15 @@
 using DINCAE
 using Base.Iterators
 using Random
 using NCDatasets
+using AMDGPU
 using CUDA
 
 const F = Float32
 Atype = if CUDA.functional()
     CuArray{F}
+elseif AMDGPU.functional()
+    ROCArray{F}
 else
     Array{F}
 end
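
Note on the kernels: both `interpnd_d!` and `interp_adjn_d!` visit the 2^N corners of the grid cell containing each point and weight each corner by a product of `c[n]` (upper corner) or `1 - c[n]` (lower corner) per dimension. For reference, the same N-linear scheme as a plain CPU loop — a sketch that mirrors the kernel body, not a function exported by DINCAE:

```julia
# CPU reference for the N-linear interpolation done by interpnd_d!
# (illustrative only; interpnd_ref! is a hypothetical name)
function interpnd_ref!(pos::AbstractVector{<:NTuple{N}}, A, vec) where N
    for i in eachindex(pos)
        p = pos[i]
        ind = floor.(Int, p)      # lower corner of the enclosing cell
        c = p .- ind              # fractional part: interpolation coefficients

        for offset in CartesianIndices(ntuple(n -> 0:1, Val(N)))
            p2 = Tuple(offset) .+ ind
            # per-dimension weight c[n] or 1 - c[n], multiplied over dimensions
            cc = prod(ntuple(n -> (offset[n] == 1 ? c[n] : 1 - c[n]), Val(N)))
            vec[i] += A[p2...] * cc
        end
    end
    return vec
end

# bilinear example (N = 2): the cell center averages the four corners
A = Float32[1 2; 3 4]
vec = zeros(Float32, 1)
interpnd_ref!([(1.5f0, 1.5f0)], A, vec)   # vec[1] ≈ 2.5f0
```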
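
The new `interpnd!` wrapper compiles the kernel with `@roc launch=false` and queries `launch_configuration` for an occupancy-based group size; because the kernel iterates with a grid stride, any `groupsize`/`gridsize` pair covers all points. A self-contained sketch of that launch pattern, using a hypothetical kernel `incr!` (the `launch_configuration(kernel.fun)` call mirrors the one in the patch):

```julia
using AMDGPU

# hypothetical grid-stride kernel, used only to illustrate the launch pattern
function incr!(x)
    index = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    stride = gridGroupDim().x * workgroupDim().x
    for i = index:stride:length(x)
        @inbounds x[i] += 1
    end
    return nothing
end

x = AMDGPU.zeros(Float32, 10_000)
kernel = @roc launch=false incr!(x)           # compile without launching
config = launch_configuration(kernel.fun)     # occupancy-based suggestion
groupsize = min(length(x), config.groupsize)
gridsize = cld(length(x), groupsize)          # enough groups for one pass
AMDGPU.@sync kernel(x; groupsize, gridsize)
@assert all(Array(x) .== 1)
```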
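
The `gridsize = 1` restriction in `interp_adjn!` exists only because `A2 .= 0` runs inside the kernel: a workgroup that starts late could clear entries another workgroup has already accumulated via `atomic_add!`. The TODO added to `CUDAExt.jl` points at the fix: zero the output on the host before launching. A sketch of that variant, under the assumption that the `A2 .= 0` line is removed from `interp_adjn_d!` (the name `interp_adjn_hostinit!` is hypothetical):

```julia
# variant with host-side initialization, so many workgroups can run
function interp_adjn_hostinit!(pos::AbstractVector{<:NTuple{N}}, values_d::ROCArray, d_A2) where N
    fill!(d_A2, 0)        # runs once, before any atomic_add! executes
    AMDGPU.@sync begin
        len = length(pos)
        kernel = @roc launch=false interp_adjn_d!(pos, values_d, d_A2)
        config = launch_configuration(kernel.fun)
        groupsize = min(len, config.groupsize)
        gridsize = cld(len, groupsize)   # no longer pinned to 1
        kernel(pos, values_d, d_A2; groupsize, gridsize)
    end
end
```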
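
With the extended `Atype` chain in the test, the rest of the file stays backend-agnostic: constructing arrays through `Atype` lands the data on whichever device is functional. For example (`data` is a hypothetical variable):

```julia
data = Atype(randn(F, 16, 16))   # CuArray{Float32}, ROCArray{Float32}, or Array{Float32}
```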