From aa6f3827ae406ad047b98362256908eda6f586a4 Mon Sep 17 00:00:00 2001 From: Niklas Hackelberg Date: Tue, 6 Jan 2026 11:59:42 +0100 Subject: [PATCH 1/4] Precompute B on GPU --- ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl | 4 +- ext/NFFTGPUArraysExt/precomputation.jl | 149 +++++++++++++++++++++++ src/precomputation.jl | 4 +- 3 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 ext/NFFTGPUArraysExt/precomputation.jl diff --git a/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl b/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl index 2d4c4410..e3d824ba 100644 --- a/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl +++ b/ext/NFFTGPUArraysExt/NFFTGPUArraysExt.jl @@ -2,8 +2,10 @@ module NFFTGPUArraysExt using NFFT, NFFT.AbstractNFFTs using NFFT.SparseArrays, NFFT.LinearAlgebra, NFFT.FFTW -using GPUArrays, Adapt +using GPUArrays, GPUArrays.KernelAbstractions, Adapt +using GPUArrays.KernelAbstractions.Extras: @unroll include("implementation.jl") +include("precomputation.jl") end diff --git a/ext/NFFTGPUArraysExt/precomputation.jl b/ext/NFFTGPUArraysExt/precomputation.jl new file mode 100644 index 00000000..b6e1e1a6 --- /dev/null +++ b/ext/NFFTGPUArraysExt/precomputation.jl @@ -0,0 +1,149 @@ +function NFFT.precomputeB(win, k::AbstractGPUArray, N::NTuple{D,Int}, Ñ::NTuple{D,Int}, m, J, σ, K, T) where D + I = similar(k, Int64, (2*m)^D, J) + β = (2*m)^D + + # CPU uses a CSC constructor, which is not generically available for GPU (I think) + #Y = similar(k, Int64, J + 1) + #Y .= (0:J) .* β .+ 1 + # We have to use the COO constructor and need (2*m)^D * J values: + Y = similar(k, Int64, (2*m)^D * J) + Y .= ((0:β*J-1) .÷ β) .+ 1 + + V = similar(k, T, (2*m)^D, J) + nProd = ntuple(d-> (d==1) ? 1 : prod(Ñ[1:(d-1)]), D) + L = Val(2*m) + + @kernel cpu = false inbounds = true function precomputeB_kernel(I, V, win, k, Ñ::NTuple{D,Int}, m, σ, nProd, ::Val{Z}) where {D, Z} + idx = @index(Global, Cartesian) + j = idx[2] + linear = idx[1] + + prodWin = one(eltype(k)) + ζ = 1 + tmpIdx = linear - 1 # 0-based for index calculation + @unroll for d = 1:D + l_d = (tmpIdx % Z) + 1 # index in 1:(2*m) + tmpIdx = div(tmpIdx, Z) + + kscale = k[d, j] * Ñ[d] + off = floor(Int, kscale) - m + 1 + + idx_d = rem(l_d + off + Ñ[d] - 1, Ñ[d]) + 1 # periodic wrapped index in 1:Ñ[d] + ζ += (idx_d - 1) * nProd[d] + + # accumulate window product + prodWin *= win( (kscale - (l_d-1) - off) / Ñ[d], Ñ[d], m, σ) + end + + I[idx] = ζ + V[idx] = prodWin + end + + backend = get_backend(k) + kernel = precomputeB_kernel(backend) + kernel(I, V, win, k, Ñ, m, σ, nProd, L, ndrange = size(I)) + + S = sparse(vec(I), Y, vec(V), prod(Ñ), J) + return S +end + +#= +@inline @generated function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{D,Int}, Ñ::NTuple{D,Int}, m, J, + σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, 3, Z} + quote + + (tmpIdx_1, tmpWin_1) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) + (tmpIdx_2, tmpWin_2) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 2, L, LUTSize) + (tmpIdx_3, tmpWin_3) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 3, L, LUTSize) + + + κ_3 = 1 + ζ_3 = 1 + prodWin_3 = one(T) + + for l_3 in 1:Z + prodWin_2 = prodWin_3 * tmpWin_3[l_3] + κ_2 = κ_3 + (l_3 - 1) * mProd[3] + ζ_2 = ζ_3 + (tmpIdx_3[l_3] - 1) * nProd[3] + + for l_2 in 1:Z + prodWin_1 = prodWin_2 * tmpWin_2[l_2] + κ_1 = κ_2 + (l_2 - 1) * mProd[2] + ζ_1 = ζ_2 + (tmpIdx_2[l_2] - 1) * nProd[2] + + for l_1 in 1:Z + prodWin_0 = prodWin_1 * tmpWin_1[l_1] + κ_0 = κ_1 + (l_1 - 1) * mProd[1] + ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] + + I[κ_0, j] = ζ_0 + V[κ_0, j] = prodWin_0 + end + end + end + return + end +end + +@inline function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{1,Int}, Ñ::NTuple{1,Int}, m, J, + σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, Z} + + # Precompute per-dimension lookup + tmpIdx_1, tmpWin_1 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) + + # Initial values (equivalent to κ_1 = 1, ζ_1 = 1, prodWin_1 = one(T)) + κ_1 = 1 + ζ_1 = 1 + prodWin_1 = one(T) + + for l_1 in 1:Z + prodWin_0 = prodWin_1 * tmpWin_1[l_1] + κ_0 = κ_1 + (l_1 - 1) * mProd[1] + ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] + I[κ_0, j] = ζ_0 + V[κ_0, j] = prodWin_0 + end + return +end + +@inline function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{2,Int}, Ñ::NTuple{2,Int}, m, J, + σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, Z} + + # Precompute per-dimension lookup + tmpIdx_1, tmpWin_1 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) + tmpIdx_2, tmpWin_2 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 2, L, LUTSize) + + # Initial values (equivalent to κ_2 = 1, ζ_2 = 1, prodWin_2 = one(T)) + κ_2 = 1 + ζ_2 = 1 + prodWin_2 = one(T) + + for l_2 in 1:Z + prodWin_1 = prodWin_2 * tmpWin_2[l_2] + κ_1 = κ_2 + (l_2 - 1) * mProd[2] + ζ_1 = ζ_2 + (tmpIdx_2[l_2] - 1) * nProd[2] + + for l_1 in 1:Z + prodWin_0 = prodWin_1 * tmpWin_1[l_1] + κ_0 = κ_1 + (l_1 - 1) * mProd[1] + ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] + + I[κ_0, j] = ζ_0 + V[κ_0, j] = prodWin_0 + end + end + return +end + + +### precomputation of the window and the indices required during convolution ### + +@inline function precomputeOneNode(win::Function, k::AbstractMatrix{T}, Ñ::NTuple{1,Int}, m, + σ, scale, j, d, L::Val{Z}, LUTSize) where {T,Z} + kscale = k[d, j] * Ñ[d] + off = floor(Int, kscale) - m + 1 + tmpIdx = ntuple(l -> rem(l + off + Ñ[d] - 1, Ñ[d]) + 1, Z) + tmpWin = ntuple(l -> win((kscale - (l - 1) - off) / Ñ[d], Ñ[d], m, σ), Z) + return tmpIdx, tmpWin +end +=# \ No newline at end of file diff --git a/src/precomputation.jl b/src/precomputation.jl index 51ab86b1..73ad5849 100644 --- a/src/precomputation.jl +++ b/src/precomputation.jl @@ -1,6 +1,6 @@ ### Init some initial parameters necessary to create the plan ### -function initParams(k::Matrix{T}, N::NTuple{D,Int}, dims::Union{Integer,UnitRange{Int64}}=1:D; +function initParams(k::AbstractMatrix{T}, N::NTuple{D,Int}, dims::Union{Integer,UnitRange{Int64}}=1:D; kargs...) where {D,T} # convert dims to a unit range dims_ = (typeof(dims) <: Integer) ? (dims:dims) : dims @@ -357,7 +357,7 @@ function precomputeWindowHatInvLUT(windowHatInvLUT, win_hat, N, Ñ, m, σ, T) end end -function precomputation(k::Union{Matrix{T},Vector{T}}, N::NTuple{D,Int}, Ñ, params) where {T,D} +function precomputation(k::Union{AbstractMatrix{T},AbstractVector{T}}, N::NTuple{D,Int}, Ñ, params) where {T,D} m = params.m; σ = params.σ; window=params.window LUTSize = params.LUTSize; precompute = params.precompute From 532c920372028374ee34211248d9b28951c2e129 Mon Sep 17 00:00:00 2001 From: Niklas Hackelberg Date: Tue, 6 Jan 2026 12:00:15 +0100 Subject: [PATCH 2/4] Adapt plan_nfft for GPU arrays/arrTypes to compute B on gpu --- ext/NFFTGPUArraysExt/implementation.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/NFFTGPUArraysExt/implementation.jl b/ext/NFFTGPUArraysExt/implementation.jl index 43712980..281248e4 100644 --- a/ext/NFFTGPUArraysExt/implementation.jl +++ b/ext/NFFTGPUArraysExt/implementation.jl @@ -16,7 +16,8 @@ mutable struct GPU_NFFTPlan{T,D, arrTc <: AbstractGPUArray{Complex{T}, D}, vecI B::SM end -function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::Matrix{T}, N::NTuple{D,Int}, rest...; +AbstractNFFTs.plan_nfft(b::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::Matrix{T}, N::NTuple{D,Int}, rest...; kargs...) where {T,D} = plan_nfft(b, arr, arr(k), N, rest...; kargs...) +function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k::AbstractGPUArray{T}, N::NTuple{D,Int}, rest...; timing::Union{Nothing,TimingStats} = nothing, kargs...) where {T,D} t = @elapsed begin p = GPU_NFFTPlan(arr, k, N, rest...; kargs...) @@ -27,7 +28,7 @@ function AbstractNFFTs.plan_nfft(::NFFTBackend, arr::Type{<:AbstractGPUArray}, k return p end -function GPU_NFFTPlan(arr, k::Matrix{T}, N::NTuple{D,Int}; dims::Union{Integer,UnitRange{Int64}}=1:D, +function GPU_NFFTPlan(arr, k::AbstractGPUMatrix{T}, N::NTuple{D,Int}; dims::Union{Integer,UnitRange{Int64}}=1:D, fftflags=nothing, kwargs...) where {T,D} if dims != 1:D @@ -50,10 +51,9 @@ function GPU_NFFTPlan(arr, k::Matrix{T}, N::NTuple{D,Int}; dims::Union{Integer,U deconvIdx = Int32.(adapt(arr, (deconvolveIdx))) winHatInvLUT = Complex{T}.(adapt(arr, (windowHatInvLUT[1]))) - B_ = Complex{T}.(adapt(arr, (B))) # Bit hacky - GPU_NFFTPlan{T,D, typeof(tmpVec), typeof(deconvIdx), typeof(FP), typeof(BP), typeof(winHatInvLUT), typeof(B_)}(N, NOut, J, k, Ñ, dims_, params, FP, BP, tmpVec, tmpVecHat, - deconvIdx, windowLinInterp, winHatInvLUT, B_) + GPU_NFFTPlan{T,D, typeof(tmpVec), typeof(deconvIdx), typeof(FP), typeof(BP), typeof(winHatInvLUT), typeof(B)}(N, NOut, J, k, Ñ, dims_, params, FP, BP, tmpVec, tmpVecHat, + deconvIdx, windowLinInterp, winHatInvLUT, B) end AbstractNFFTs.size_in(p::GPU_NFFTPlan) = p.N From 8734f90d894d7875a996776523adb68d710c87fc Mon Sep 17 00:00:00 2001 From: Niklas Hackelberg Date: Tue, 6 Jan 2026 12:01:33 +0100 Subject: [PATCH 3/4] Remove comments --- ext/NFFTGPUArraysExt/precomputation.jl | 103 +------------------------ 1 file changed, 1 insertion(+), 102 deletions(-) diff --git a/ext/NFFTGPUArraysExt/precomputation.jl b/ext/NFFTGPUArraysExt/precomputation.jl index b6e1e1a6..b0dca44b 100644 --- a/ext/NFFTGPUArraysExt/precomputation.jl +++ b/ext/NFFTGPUArraysExt/precomputation.jl @@ -45,105 +45,4 @@ function NFFT.precomputeB(win, k::AbstractGPUArray, N::NTuple{D,Int}, Ñ::NTupl S = sparse(vec(I), Y, vec(V), prod(Ñ), J) return S -end - -#= -@inline @generated function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{D,Int}, Ñ::NTuple{D,Int}, m, J, - σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, 3, Z} - quote - - (tmpIdx_1, tmpWin_1) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) - (tmpIdx_2, tmpWin_2) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 2, L, LUTSize) - (tmpIdx_3, tmpWin_3) = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 3, L, LUTSize) - - - κ_3 = 1 - ζ_3 = 1 - prodWin_3 = one(T) - - for l_3 in 1:Z - prodWin_2 = prodWin_3 * tmpWin_3[l_3] - κ_2 = κ_3 + (l_3 - 1) * mProd[3] - ζ_2 = ζ_3 + (tmpIdx_3[l_3] - 1) * nProd[3] - - for l_2 in 1:Z - prodWin_1 = prodWin_2 * tmpWin_2[l_2] - κ_1 = κ_2 + (l_2 - 1) * mProd[2] - ζ_1 = ζ_2 + (tmpIdx_2[l_2] - 1) * nProd[2] - - for l_1 in 1:Z - prodWin_0 = prodWin_1 * tmpWin_1[l_1] - κ_0 = κ_1 + (l_1 - 1) * mProd[1] - ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] - - I[κ_0, j] = ζ_0 - V[κ_0, j] = prodWin_0 - end - end - end - return - end -end - -@inline function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{1,Int}, Ñ::NTuple{1,Int}, m, J, - σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, Z} - - # Precompute per-dimension lookup - tmpIdx_1, tmpWin_1 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) - - # Initial values (equivalent to κ_1 = 1, ζ_1 = 1, prodWin_1 = one(T)) - κ_1 = 1 - ζ_1 = 1 - prodWin_1 = one(T) - - for l_1 in 1:Z - prodWin_0 = prodWin_1 * tmpWin_1[l_1] - κ_0 = κ_1 + (l_1 - 1) * mProd[1] - ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] - I[κ_0, j] = ζ_0 - V[κ_0, j] = prodWin_0 - end - return -end - -@inline function _precomputeB(win, k::AbstractMatrix{T}, N::NTuple{2,Int}, Ñ::NTuple{2,Int}, m, J, - σ, scale, I, Y, V, mProd, nProd, L::Val{Z}, j, LUTSize) where {T, Z} - - # Precompute per-dimension lookup - tmpIdx_1, tmpWin_1 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 1, L, LUTSize) - tmpIdx_2, tmpWin_2 = precomputeOneNode(win, k, Ñ, m, σ, scale, j, 2, L, LUTSize) - - # Initial values (equivalent to κ_2 = 1, ζ_2 = 1, prodWin_2 = one(T)) - κ_2 = 1 - ζ_2 = 1 - prodWin_2 = one(T) - - for l_2 in 1:Z - prodWin_1 = prodWin_2 * tmpWin_2[l_2] - κ_1 = κ_2 + (l_2 - 1) * mProd[2] - ζ_1 = ζ_2 + (tmpIdx_2[l_2] - 1) * nProd[2] - - for l_1 in 1:Z - prodWin_0 = prodWin_1 * tmpWin_1[l_1] - κ_0 = κ_1 + (l_1 - 1) * mProd[1] - ζ_0 = ζ_1 + (tmpIdx_1[l_1] - 1) * nProd[1] - - I[κ_0, j] = ζ_0 - V[κ_0, j] = prodWin_0 - end - end - return -end - - -### precomputation of the window and the indices required during convolution ### - -@inline function precomputeOneNode(win::Function, k::AbstractMatrix{T}, Ñ::NTuple{1,Int}, m, - σ, scale, j, d, L::Val{Z}, LUTSize) where {T,Z} - kscale = k[d, j] * Ñ[d] - off = floor(Int, kscale) - m + 1 - tmpIdx = ntuple(l -> rem(l + off + Ñ[d] - 1, Ñ[d]) + 1, Z) - tmpWin = ntuple(l -> win((kscale - (l - 1) - off) / Ñ[d], Ñ[d], m, σ), Z) - return tmpIdx, tmpWin -end -=# \ No newline at end of file +end \ No newline at end of file From 527109086e2918440cdfb5e03aa5528230d86b41 Mon Sep 17 00:00:00 2001 From: Niklas Hackelberg Date: Sat, 24 Jan 2026 16:30:53 +0100 Subject: [PATCH 4/4] Allow B precomputation for JLArrays --- ext/NFFTGPUArraysExt/precomputation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/NFFTGPUArraysExt/precomputation.jl b/ext/NFFTGPUArraysExt/precomputation.jl index b0dca44b..65173934 100644 --- a/ext/NFFTGPUArraysExt/precomputation.jl +++ b/ext/NFFTGPUArraysExt/precomputation.jl @@ -13,7 +13,7 @@ function NFFT.precomputeB(win, k::AbstractGPUArray, N::NTuple{D,Int}, Ñ::NTupl nProd = ntuple(d-> (d==1) ? 1 : prod(Ñ[1:(d-1)]), D) L = Val(2*m) - @kernel cpu = false inbounds = true function precomputeB_kernel(I, V, win, k, Ñ::NTuple{D,Int}, m, σ, nProd, ::Val{Z}) where {D, Z} + @kernel inbounds = true function precomputeB_kernel(I, V, win, k, Ñ::NTuple{D,Int}, m, σ, nProd, ::Val{Z}) where {D, Z} idx = @index(Global, Cartesian) j = idx[2] linear = idx[1]