From 3fb5625190b4fc6c25b95defea1fec1f50ceb7b6 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Thu, 18 Jun 2026 15:44:10 +0000
Subject: [PATCH 1/3] Use the cached task-local SYCL queue for oneMKL FFT plans

`_create_descriptor` built fresh syclDevice/syclContext/syclQueue
objects for every FFT plan. Once those wrappers become garbage their
finalizers (syclQueueDestroy etc.) tear down SYCL runtime state for the
still-in-use underlying Level Zero queue, corrupting later DFT commits
and crashing at process exit.

Use the cached task-local `sycl_queue(global_queue(...))` accessor that
every other oneMKL wrapper already uses, so the plan shares the managed
queue lifetime instead of owning a throwaway one.
---
 lib/mkl/fft.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/mkl/fft.jl b/lib/mkl/fft.jl
index 745ff65f..4429801b 100644
--- a/lib/mkl/fft.jl
+++ b/lib/mkl/fft.jl
@@ -105,11 +105,13 @@ function _create_descriptor(sz::NTuple{N,Int}, T::Type, complex::Bool) where {N}
     desc = desc_ref[]
     # Do not program descriptor scaling; we'll perform inverse normalization manually.
     # Set placement explicitly based on plan type later
-    # Construct a SYCL queue from current Level Zero context/device (reuse global queue)
+    # Use the task-local cached SYCL queue wrapping the global Level Zero queue, like the
+    # other oneMKL wrappers do. Creating fresh syclContext/syclQueue objects per plan is
+    # unsound: once they become garbage their finalizers (syclQueueDestroy etc.) tear down
+    # SYCL runtime state for the still-in-use underlying queue, corrupting later DFT
+    # commits and crashing at process exit.
     ze_ctx = oneAPI.context(); ze_dev = oneAPI.device()
-    sycl_dev = SYCL.syclDevice(SYCL.syclPlatform(oneAPI.driver()), ze_dev)
-    sycl_ctx = SYCL.syclContext([sycl_dev], ze_ctx)
-    q = SYCL.syclQueue(sycl_ctx, sycl_dev, oneAPI.global_queue(ze_ctx, ze_dev))
+    q = oneAPI.sycl_queue(oneAPI.global_queue(ze_ctx, ze_dev))
     return desc, q
 end
 

From 3719b89cb5bbc8b7a04e3ac83481bc2c14d58413 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Mon, 22 Jun 2026 15:27:53 +0000
Subject: [PATCH 2/3] Add regression test for FFT plan SYCL queue lifetime

---
 test/fft.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/fft.jl b/test/fft.jl
index 1b148dfe..b01d1aba 100644
--- a/test/fft.jl
+++ b/test/fft.jl
@@ -79,4 +79,19 @@ end
         end
     end
 end
+
+@testset "shared queue lifetime across plans" begin
+    dX1 = gpu(rand(ComplexF32, 8))
+    p1 = AbstractFFTs.plan_fft(dX1)
+    dY1 = p1 * dX1
+    p1i = AbstractFFTs.plan_ifft(dX1)
+    p1i * dY1
+
+    GC.gc(true)  # run finalizers of any throwaway per-plan SYCL wrappers
+
+    X2 = rand(ComplexF32, 8, 32)
+    dX2 = gpu(X2)
+    p2 = AbstractFFTs.plan_fft(dX2)
+    cmp(p2 * dX2, fft(X2))
+end
 end

From a1c8ade580f9420c14fe2a31535f3751f35d3e90 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Mon, 22 Jun 2026 10:47:04 -0500
Subject: [PATCH 3/3] Assert FFT plans share the cached SYCL queue handle

Add deterministic assertions to the queue-lifetime testset: every plan's
stored queue handle must equal the handle of the task-local cached SYCL
queue. This fails on the old throwaway-per-plan code regardless of hardware,
unlike the GC-roundtrip check, which only crashes on PVC-class teardown.
---
 test/fft.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/fft.jl b/test/fft.jl
index b01d1aba..d4419462 100644
--- a/test/fft.jl
+++ b/test/fft.jl
@@ -81,8 +81,15 @@ end
 end
 
 @testset "shared queue lifetime across plans" begin
+    # Plans must share the single cached task-local SYCL queue rather than each owning a
+    # throwaway one (whose finalizer would tear down shared SYCL/oneMKL state). Assert the
+    # shared handle deterministically, independent of whether a stale queue would crash.
+    cached_handle = Base.unsafe_convert(oneAPI.oneMKL.syclQueue_t,
+        oneAPI.sycl_queue(oneAPI.global_queue(oneAPI.context(), oneAPI.device())))
+
     dX1 = gpu(rand(ComplexF32, 8))
     p1 = AbstractFFTs.plan_fft(dX1)
+    @test p1.queue == cached_handle
     dY1 = p1 * dX1
     p1i = AbstractFFTs.plan_ifft(dX1)
     p1i * dY1
@@ -92,6 +99,7 @@ end
     X2 = rand(ComplexF32, 8, 32)
     dX2 = gpu(X2)
     p2 = AbstractFFTs.plan_fft(dX2)
+    @test p2.queue == cached_handle
     cmp(p2 * dX2, fft(X2))
 end
 end