diff --git a/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl b/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl
index 2d4c4410..e3d824ba 100644
--- a/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl
+++ b/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl
@@ -2,8 +2,10 @@ module NFFTGPUArraysExt
 
 using NFFT, NFFT.AbstractNFFTs
 using NFFT.SparseArrays, NFFT.LinearAlgebra, NFFT.FFTW
-using GPUArrays, Adapt
+using GPUArrays, GPUArrays.KernelAbstractions, Adapt
+using GPUArrays.KernelAbstractions.Extras: @unroll
 
 include("implementation.jl")
+include("precomputation.jl")
 
 end
diff --git a/ext/NFFTGPUArraysExt/implementation.jl b/ext/NFFTGPUArraysExt/implementation.jl
index 43712980..281248e4 100644
--- a/ext/NFFTGPUArraysExt/implementation.jl
+++ b/ext/NFFTGPUArraysExt/implementation.jl
@@ -16,7 +16,8 @@ mutable struct GPU_NFFTPlan{T,D, arrTc <: AbstractGPUArray{Complex{T}, D}, vecI
   B::SM
 end
 
-function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::Matrix{T}, N::NTuple{D,Int}, rest...;
+AbstractNFFTs.plan_nfft(b::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::Matrix{T}, N::NTuple{D,Int}, rest...; kargs...) where {T,D} = plan_nfft(b, arr, arr(k), N, rest...; kargs...)
+function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::AbstractGPUArray{T}, N::NTuple{D,Int}, rest...;
   timing::Union{Nothing,TimingStats} = nothing, kargs...) where {T,D}
   t = @elapsed begin
     p = GPU_NFFTPlan(arr, k, N, rest...; kargs...)
@@ -27,7 +28,7 @@ function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k
   return p
 end
 
-function GPU_NFFTPlan(arr, k::Matrix{T}, N::NTuple{D,Int}; dims::Union{Integer,UnitRange{Int64}}=1:D,
+function GPU_NFFTPlan(arr, k::AbstractGPUMatrix{T}, N::NTuple{D,Int}; dims::Union{Integer,UnitRange{Int64}}=1:D,
                  fftflags=nothing, kwargs...) where {T,D}
 
     if dims != 1:D
@@ -50,10 +51,9 @@ function GPU_NFFTPlan(arr, k::Matrix{T}, N::NTuple{D,Int}; dims::Union{Integer,U
 
     deconvIdx = Int32.(adapt(arr, (deconvolveIdx)))
     winHatInvLUT = Complex{T}.(adapt(arr, (windowHatInvLUT[1]))) 
-    B_ = Complex{T}.(adapt(arr, (B))) # Bit hacky
 
-    GPU_NFFTPlan{T,D, typeof(tmpVec), typeof(deconvIdx), typeof(FP), typeof(BP), typeof(winHatInvLUT), typeof(B_)}(N, NOut, J, k, Ñ, dims_, params, FP, BP, tmpVec, tmpVecHat, 
-               deconvIdx, windowLinInterp, winHatInvLUT, B_)
+    GPU_NFFTPlan{T,D, typeof(tmpVec), typeof(deconvIdx), typeof(FP), typeof(BP), typeof(winHatInvLUT), typeof(B)}(N, NOut, J, k, Ñ, dims_, params, FP, BP, tmpVec, tmpVecHat, 
+               deconvIdx, windowLinInterp, winHatInvLUT, B)
 end
 
 AbstractNFFTs.size_in(p::GPU_NFFTPlan) = p.N
diff --git a/ext/NFFTGPUArraysExt/precomputation.jl b/ext/NFFTGPUArraysExt/precomputation.jl
new file mode 100644
index 00000000..65173934
--- /dev/null
+++ b/ext/NFFTGPUArraysExt/precomputation.jl
@@ -0,0 +1,48 @@
+function NFFT.precomputeB(win, k::AbstractGPUArray, N::NTuple{D,Int}, Ñ::NTuple{D,Int}, m, J, σ, K, T) where D
+  I = similar(k, Int64, (2*m)^D, J)
+  β = (2*m)^D
+
+  # CPU uses a CSC constructor, which is not generically available for GPU (I think)
+  #Y = similar(k, Int64, J + 1)
+  #Y .= (0:J) .* β .+ 1
+  # We have to use the COO constructor and need (2*m)^D * J values:
+  Y = similar(k, Int64, (2*m)^D * J)
+  Y .= ((0:β*J-1) .÷ β) .+ 1
+  
+  V = similar(k, T, (2*m)^D, J)
+  nProd = ntuple(d-> (d==1) ? 1 : prod(Ñ[1:(d-1)]), D)
+  L = Val(2*m)
+
+  @kernel inbounds = true function precomputeB_kernel(I, V, win, k, Ñ::NTuple{D,Int}, m, σ, nProd, ::Val{Z}) where {D, Z}
+    idx = @index(Global, Cartesian)
+    j = idx[2]
+    linear = idx[1]
+    
+    prodWin = one(eltype(k))
+    ζ = 1
+    tmpIdx = linear - 1 # 0-based for index calculation
+    @unroll for d = 1:D
+      l_d = (tmpIdx % Z) + 1 # index in 1:(2*m)
+      tmpIdx = div(tmpIdx, Z)
+
+      kscale = k[d, j] * Ñ[d]
+      off = floor(Int, kscale) - m + 1
+
+      idx_d = rem(l_d + off + Ñ[d] - 1, Ñ[d]) + 1 # periodic wrapped index in 1:Ñ[d] 
+      ζ += (idx_d - 1) * nProd[d]
+
+      # accumulate window product
+      prodWin *= win( (kscale - (l_d-1) - off)  / Ñ[d], Ñ[d], m, σ)
+    end
+
+    I[idx] = ζ
+    V[idx] = prodWin
+  end
+
+  backend = get_backend(k)
+  kernel = precomputeB_kernel(backend)
+  kernel(I, V, win, k, Ñ, m, σ, nProd, L, ndrange = size(I))
+
+  S = sparse(vec(I), Y, vec(V), prod(Ñ), J)
+  return S
+end
\ No newline at end of file
diff --git a/src/precomputation.jl b/src/precomputation.jl
index 51ab86b1..73ad5849 100644
--- a/src/precomputation.jl
+++ b/src/precomputation.jl
@@ -1,6 +1,6 @@
 ### Init some initial parameters necessary to create the plan ###
 
-function initParams(k::Matrix{T}, N::NTuple{D,Int}, dims::Union{Integer,UnitRange{Int64}}=1:D; 
+function initParams(k::AbstractMatrix{T}, N::NTuple{D,Int}, dims::Union{Integer,UnitRange{Int64}}=1:D; 
                     kargs...) where {D,T}
   # convert dims to a unit range
   dims_ = (typeof(dims) <: Integer) ? (dims:dims) : dims
@@ -357,7 +357,7 @@ function precomputeWindowHatInvLUT(windowHatInvLUT, win_hat, N, Ñ, m, σ, T)
   end
 end
 
-function precomputation(k::Union{Matrix{T},Vector{T}}, N::NTuple{D,Int}, Ñ, params) where {T,D}
+function precomputation(k::Union{AbstractMatrix{T},AbstractVector{T}}, N::NTuple{D,Int}, Ñ, params) where {T,D}
 
   m = params.m; σ = params.σ; window=params.window
   LUTSize = params.LUTSize; precompute = params.precompute