From 3fb5625190b4fc6c25b95defea1fec1f50ceb7b6 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Thu, 18 Jun 2026 15:44:10 +0000 Subject: [PATCH 1/3] Use the cached task-local SYCL queue for oneMKL FFT plans `_create_descriptor` built fresh syclDevice/syclContext/syclQueue objects for every FFT plan. Once those wrappers become garbage their finalizers (syclQueueDestroy etc.) tear down SYCL runtime state for the still-in-use underlying Level Zero queue, corrupting later DFT commits and crashing at process exit. Use the cached task-local `sycl_queue(global_queue(...))` accessor that every other oneMKL wrapper already uses, so the plan shares the managed queue lifetime instead of owning a throwaway one. --- lib/mkl/fft.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/mkl/fft.jl b/lib/mkl/fft.jl index 745ff65f..4429801b 100644 --- a/lib/mkl/fft.jl +++ b/lib/mkl/fft.jl @@ -105,11 +105,13 @@ function _create_descriptor(sz::NTuple{N,Int}, T::Type, complex::Bool) where {N} desc = desc_ref[] # Do not program descriptor scaling; we'll perform inverse normalization manually. # Set placement explicitly based on plan type later - # Construct a SYCL queue from current Level Zero context/device (reuse global queue) + # Use the task-local cached SYCL queue wrapping the global Level Zero queue, like the + # other oneMKL wrappers do. Creating fresh syclContext/syclQueue objects per plan is + # unsound: once they become garbage their finalizers (syclQueueDestroy etc.) tear down + # SYCL runtime state for the still-in-use underlying queue, corrupting later DFT + # commits and crashing at process exit. ze_ctx = oneAPI.context(); ze_dev = oneAPI.device() - sycl_dev = SYCL.syclDevice(SYCL.syclPlatform(oneAPI.driver()), ze_dev) - sycl_ctx = SYCL.syclContext([sycl_dev], ze_ctx) - q = SYCL.syclQueue(sycl_ctx, sycl_dev, oneAPI.global_queue(ze_ctx, ze_dev)) + q = oneAPI.sycl_queue(oneAPI.global_queue(ze_ctx, ze_dev)) return desc, q end From 3719b89cb5bbc8b7a04e3ac83481bc2c14d58413 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Mon, 22 Jun 2026 15:27:53 +0000 Subject: [PATCH 2/3] Add test --- test/fft.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/fft.jl b/test/fft.jl index 1b148dfe..b01d1aba 100644 --- a/test/fft.jl +++ b/test/fft.jl @@ -79,4 +79,19 @@ end end end end + +@testset "shared queue lifetime across plans" begin + dX1 = gpu(rand(ComplexF32, 8)) + p1 = AbstractFFTs.plan_fft(dX1) + dY1 = p1 * dX1 + p1i = AbstractFFTs.plan_ifft(dX1) + p1i * dY1 + + GC.gc(true) # run finalizers of any throwaway per-plan SYCL wrappers + + X2 = rand(ComplexF32, 8, 32) + dX2 = gpu(X2) + p2 = AbstractFFTs.plan_fft(dX2) + cmp(p2 * dX2, fft(X2)) +end end From a1c8ade580f9420c14fe2a31535f3751f35d3e90 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Mon, 22 Jun 2026 10:47:04 -0500 Subject: [PATCH 3/3] Assert FFT plans share the cached SYCL queue handle Add deterministic assertions to the queue-lifetime testset: every plan's stored queue handle must equal the task-local cached SYCL queue. This fails on the old throwaway-per-plan code regardless of hardware, unlike the GC-roundtrip check which only crashes on PVC-class teardown. --- test/fft.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/fft.jl b/test/fft.jl index b01d1aba..d4419462 100644 --- a/test/fft.jl +++ b/test/fft.jl @@ -81,8 +81,15 @@ end end @testset "shared queue lifetime across plans" begin + # Plans must share the single cached task-local SYCL queue rather than each owning a + # throwaway one (whose finalizer would tear down shared SYCL/oneMKL state). Assert the + # shared handle deterministically, independent of whether a stale queue would crash. + cached_handle = Base.unsafe_convert(oneAPI.oneMKL.syclQueue_t, + oneAPI.sycl_queue(oneAPI.global_queue(oneAPI.context(), oneAPI.device()))) + dX1 = gpu(rand(ComplexF32, 8)) p1 = AbstractFFTs.plan_fft(dX1) + @test p1.queue == cached_handle dY1 = p1 * dX1 p1i = AbstractFFTs.plan_ifft(dX1) p1i * dY1 @@ -92,6 +99,7 @@ end X2 = rand(ComplexF32, 8, 32) dX2 = gpu(X2) p2 = AbstractFFTs.plan_fft(dX2) + @test p2.queue == cached_handle cmp(p2 * dX2, fft(X2)) end end