From c5078c9b5aa4d5e7ec2cf2ddd59a62b9c713157a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 12 Mar 2026 16:27:59 +0100
Subject: [PATCH 01/13] Switch to ParallelTestRunner.

---
 .buildkite/pipeline.yml            |  40 ++----
 .github/workflows/CI-CPU.yml       |   4 +-
 test/Project.toml                  |  10 ++
 test/aqua.jl                       |   4 +
 test/{ => generic}/accumulate.jl   |   0
 test/{ => generic}/binarysearch.jl |   0
 test/{ => generic}/looping.jl      |   0
 test/{ => generic}/map.jl          |   0
 test/{ => generic}/predicates.jl   |   0
 test/{ => generic}/reduce.jl       |   0
 test/{ => generic}/sort.jl         |   0
 test/runtests.jl                   | 193 +++++++++++++++++++----------
 12 files changed, 154 insertions(+), 97 deletions(-)
 create mode 100644 test/aqua.jl
 rename test/{ => generic}/accumulate.jl (100%)
 rename test/{ => generic}/binarysearch.jl (100%)
 rename test/{ => generic}/looping.jl (100%)
 rename test/{ => generic}/map.jl (100%)
 rename test/{ => generic}/predicates.jl (100%)
 rename test/{ => generic}/reduce.jl (100%)
 rename test/{ => generic}/sort.jl (100%)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 582ab1df..7b004fdc 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -7,13 +7,10 @@ steps:
           version: "1.10"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("CUDA")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+                Pkg.test("AcceleratedKernels", test_args=["--cuda"])'
     agents:
       queue: "cuda"
     if: build.message !~ /\[skip tests\]/
@@ -25,13 +22,10 @@ steps:
           version: "1.11"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("CUDA")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+                Pkg.test("AcceleratedKernels", test_args=["--cuda"])'
     agents:
       queue: "cuda"
     if: build.message !~ /\[skip tests\]/
@@ -45,13 +39,10 @@ steps:
           version: "1.10"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("AMDGPU")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+                Pkg.test("AcceleratedKernels", test_args=["--amdgpu"])'
     agents:
       queue: "rocm"
       rocmgpu: "*"
@@ -64,13 +55,10 @@ steps:
           version: "1.11"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("AMDGPU")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+                Pkg.test("AcceleratedKernels", test_args=["--amdgpu"])'
     agents:
       queue: "rocm"
       rocmgpu: "*"
@@ -85,13 +73,10 @@ steps:
           version: "1.10"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+                Pkg.test("AcceleratedKernels", test_args=["--oneapi"])'
     agents:
       queue: "oneapi"
     if: build.message !~ /\[skip tests\]/
@@ -103,13 +88,10 @@ steps:
           version: "1.11"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+                Pkg.test("AcceleratedKernels", test_args=["--oneapi"])'
     agents:
       queue: "oneapi"
     if: build.message !~ /\[skip tests\]/
@@ -123,13 +105,10 @@ steps:
           version: "1.10"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("Metal")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+                Pkg.test("AcceleratedKernels", test_args=["--metal"])'
     agents:
       queue: "metal"
     if: build.message !~ /\[skip tests\]/
@@ -141,13 +120,10 @@ steps:
           version: "1.11"
     command: |
       julia -e 'using Pkg
-
                 println("--- :julia: Instantiating environment")
-                Pkg.add("Metal")
                 Pkg.develop(path=".")
-
                 println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+                Pkg.test("AcceleratedKernels", test_args=["--metal"])'
     agents:
       queue: "metal"
     if: build.message !~ /\[skip tests\]/
diff --git a/.github/workflows/CI-CPU.yml b/.github/workflows/CI-CPU.yml
index 2944bf2b..e46bc3b7 100644
--- a/.github/workflows/CI-CPU.yml
+++ b/.github/workflows/CI-CPU.yml
@@ -82,7 +82,7 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         with:
-          test_args: '--OpenCL'
+          test_args: '--opencl'
   # cpuKA:
   #   name: KA CPU Backend
   #   runs-on: ubuntu-latest
@@ -102,7 +102,7 @@ jobs:
   #     - uses: julia-actions/julia-buildpkg@v1
   #     - uses: julia-actions/julia-runtest@v1
   #       with:
-  #         test_args: '--cpuKA'
+  #         test_args: '--cpu-ka'
   docs:
     name: Documentation
     runs-on: ubuntu-latest
diff --git a/test/Project.toml b/test/Project.toml
index d77e276f..0a79e7d4 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,7 +1,17 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
+ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
+
+[compat]
+ParallelTestRunner = "2"
diff --git a/test/aqua.jl b/test/aqua.jl
new file mode 100644
index 00000000..14bd2771
--- /dev/null
+++ b/test/aqua.jl
@@ -0,0 +1,4 @@
+using Aqua
+@testset "Aqua" begin
+    Aqua.test_all(AK)
+end
diff --git a/test/accumulate.jl b/test/generic/accumulate.jl
similarity index 100%
rename from test/accumulate.jl
rename to test/generic/accumulate.jl
diff --git a/test/binarysearch.jl b/test/generic/binarysearch.jl
similarity index 100%
rename from test/binarysearch.jl
rename to test/generic/binarysearch.jl
diff --git a/test/looping.jl b/test/generic/looping.jl
similarity index 100%
rename from test/looping.jl
rename to test/generic/looping.jl
diff --git a/test/map.jl b/test/generic/map.jl
similarity index 100%
rename from test/map.jl
rename to test/generic/map.jl
diff --git a/test/predicates.jl b/test/generic/predicates.jl
similarity index 100%
rename from test/predicates.jl
rename to test/generic/predicates.jl
diff --git a/test/reduce.jl b/test/generic/reduce.jl
similarity index 100%
rename from test/reduce.jl
rename to test/generic/reduce.jl
diff --git a/test/sort.jl b/test/generic/sort.jl
similarity index 100%
rename from test/sort.jl
rename to test/generic/sort.jl
diff --git a/test/runtests.jl b/test/runtests.jl
index b858c78c..4f8a6852 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,76 +1,143 @@
 import AcceleratedKernels as AK
-using KernelAbstractions
-using Test
-using Random
-import Pkg
-
-# Set to true when testing backends that support this
-const TEST_DL = Ref{Bool}(false)
-
-# Pass command-line argument to test suite to install the right backend, e.g.
-#   julia> import Pkg
-#   julia> Pkg.test(test_args=["--oneAPI"])
-if "--CUDA" in ARGS
-    Pkg.add("CUDA")
+using ParallelTestRunner
+
+const init_code = quote
+    import AcceleratedKernels as AK
+    using KernelAbstractions
+    using Test
+    using Random
+end
+
+# Discover root-level tests (aqua.jl, partition.jl) and generic tests
+const testsuite = find_tests(@__DIR__)
+const generic_tests = find_tests(joinpath(@__DIR__, "generic"))
+
+# Parse args with lowercase hyphenated backend flags
+args = parse_args(ARGS; custom=["cuda", "amdgpu", "metal", "oneapi", "opencl", "cpu-ka"])
+
+# Common helper code appended to every backend setup
+const _array_from_host_code = quote
+    global array_from_host
+    array_from_host(h_arr::AbstractArray, dtype=nothing) = array_from_host(BACKEND, h_arr, dtype)
+    function array_from_host(backend, h_arr::AbstractArray, dtype=nothing)
+        d_arr = KernelAbstractions.zeros(backend, isnothing(dtype) ? eltype(h_arr) : dtype, size(h_arr))
+        copyto!(d_arr, h_arr isa Array ? h_arr : Array(h_arr))
+        d_arr
+    end
+end
+
+# Build list of active backends, each with setup code
+backends = Pair{String, Expr}[]
+
+# CPU always active
+using InteractiveUtils
+@info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
+push!(backends, "cpu" => quote
+    global BACKEND = get_backend([])
+    global IS_CPU_BACKEND = true
+    global prefer_threads = true
+    global TEST_DL = Ref{Bool}(false)
+    $_array_from_host_code
+end)
+
+# cpu-ka only when --cpu-ka flag passed
+if args.custom["cpu-ka"] !== nothing
+    push!(backends, "cpu-ka" => quote
+        global BACKEND = get_backend([])
+        global IS_CPU_BACKEND = true
+        global prefer_threads = false
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
+end
+
+# GPU backends: auto-detect functional ones, or enable explicitly via CLI flags.
+# Always assert functional() before proceeding, then print versioninfo() once.
+
+if try; using CUDA; CUDA.functional(); catch; false; end || args.custom["cuda"] !== nothing
     using CUDA
-    CUDA.versioninfo()
-    const BACKEND = CUDABackend()
-    TEST_DL[] = true
-elseif "--oneAPI" in ARGS
-    Pkg.add("oneAPI")
-    using oneAPI
-    oneAPI.versioninfo()
-    const BACKEND = oneAPIBackend()
+    @assert CUDA.functional()
+    @info "CUDA information:\n" * sprint(CUDA.versioninfo)
+    push!(backends, "cuda" => quote
+        using CUDA
+        global BACKEND = CUDABackend()
+        global IS_CPU_BACKEND = false
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(true)
+        $_array_from_host_code
+    end)
+end
 
-    # FIXME: need atomic orderings for `DecoupledLookback` in oneAPI
-    # TEST_DL[] = true
-elseif "--AMDGPU" in ARGS
-    Pkg.add("AMDGPU")
+if try; using AMDGPU; AMDGPU.functional(); catch; false; end || args.custom["amdgpu"] !== nothing
     using AMDGPU
-    AMDGPU.versioninfo()
-    const BACKEND = ROCBackend()
-    TEST_DL[] = true
-elseif "--Metal" in ARGS
-    Pkg.add("Metal")
+    @assert AMDGPU.functional()
+    @info "AMDGPU information:\n" * sprint(AMDGPU.versioninfo)
+    push!(backends, "amdgpu" => quote
+        using AMDGPU
+        global BACKEND = ROCBackend()
+        global IS_CPU_BACKEND = false
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(true)
+        $_array_from_host_code
+    end)
+end
+
+if try; using Metal; Metal.functional(); catch; false; end || args.custom["metal"] !== nothing
     using Metal
-    Metal.versioninfo()
-    const BACKEND = MetalBackend()
-elseif "--OpenCL" in ARGS
-    Pkg.add(name="OpenCL", rev="main")
-    Pkg.add(name="SPIRVIntrinsics", rev="main")
-    Pkg.add("pocl_jll")
-    using pocl_jll
-    using OpenCL
-    OpenCL.versioninfo()
-    const BACKEND = OpenCLBackend()
-elseif !@isdefined(BACKEND)
-    # Otherwise do CPU tests
-    using InteractiveUtils
-    InteractiveUtils.versioninfo()
-    const BACKEND = get_backend([])
+    @assert Metal.functional()
+    @info "Metal information:\n" * sprint(Metal.versioninfo)
+    push!(backends, "metal" => quote
+        using Metal
+        global BACKEND = MetalBackend()
+        global IS_CPU_BACKEND = false
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
 end
 
-const IS_CPU_BACKEND = BACKEND == get_backend([])
+if try; using oneAPI; oneAPI.functional(); catch; false; end || args.custom["oneapi"] !== nothing
+    using oneAPI
+    @assert oneAPI.functional()
+    @info "oneAPI information:\n" * sprint(oneAPI.versioninfo)
+    push!(backends, "oneapi" => quote
+        using oneAPI
+        global BACKEND = oneAPIBackend()
+        global IS_CPU_BACKEND = false
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
+end
 
-global prefer_threads::Bool = !(IS_CPU_BACKEND && "--cpuKA" in ARGS)
+if try; using pocl_jll, OpenCL; !isempty(OpenCL.cl.platforms()); catch; false; end || args.custom["opencl"] !== nothing
+    using pocl_jll, OpenCL
+    @assert !isempty(OpenCL.cl.platforms())
+    @info "OpenCL information:\n" * sprint(OpenCL.versioninfo)
+    push!(backends, "opencl" => quote
+        using pocl_jll
+        using OpenCL
+        global BACKEND = OpenCLBackend()
+        global IS_CPU_BACKEND = false
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
+end
 
-array_from_host(h_arr::AbstractArray, dtype=nothing) = array_from_host(BACKEND, h_arr, dtype)
-function array_from_host(backend, h_arr::AbstractArray, dtype=nothing)
-    d_arr = KernelAbstractions.zeros(backend, isnothing(dtype) ? eltype(h_arr) : dtype, size(h_arr))
-    copyto!(d_arr, h_arr isa Array ? h_arr : Array(h_arr))      # Allow unmaterialised types, e.g. ranges
-    d_arr
+# Duplicate generic tests per active backend
+for (backend_name, setup_code) in backends
+    for (test_name, test_body) in generic_tests
+        testsuite["$backend_name/$test_name"] = quote
+            $setup_code
+            $test_body
+        end
+    end
 end
 
-@testset "Aqua" begin
-    using Aqua
-    Aqua.test_all(AK)
+# Filter tests by user-specified positional args; remove bare generic/ entries if no filter was specified
+if filter_tests!(testsuite, args)
+    filter!(((k,v),) -> !startswith(k, "generic/"), testsuite)
 end
 
-include("partition.jl")
-include("looping.jl")
-include("map.jl")
-include("sort.jl")
-include("reduce.jl")
-include("accumulate.jl")
-include("predicates.jl")
-include("binarysearch.jl")
+runtests(AK, args; init_code, testsuite)

From 4d330bddf0bc1f3f9ee38716b378476af689fde5 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 11 Jun 2026 10:01:52 +0200
Subject: [PATCH 02/13] Add AcceleratedKernels to test project for direct
 runtests.jl runs.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 test/Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/Project.toml b/test/Project.toml
index 0a79e7d4..5683fde2 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

From 27a0a21e6890cc6ff5df869dcb30ea2d5a6c9c1b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 11 Jun 2026 11:46:26 +0200
Subject: [PATCH 03/13] Only test GPU back-ends when explicitly requested.

Drop the functional-backend autodetection: the CPU back-end is always tested,
and GPU back-ends are opt-in via CLI flags (--cuda, --metal, etc.), in which
case load or initialization failures propagate instead of being swallowed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 test/runtests.jl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 4f8a6852..6d5dfe77 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -51,10 +51,10 @@ if args.custom["cpu-ka"] !== nothing
     end)
 end
 
-# GPU backends: auto-detect functional ones, or enable explicitly via CLI flags.
-# Always assert functional() before proceeding, then print versioninfo() once.
+# GPU backends are only tested when explicitly requested via a CLI flag, in which case
+# they are expected to be functional: load or initialization failures propagate.
 
-if try; using CUDA; CUDA.functional(); catch; false; end || args.custom["cuda"] !== nothing
+if args.custom["cuda"] !== nothing
     using CUDA
     @assert CUDA.functional()
     @info "CUDA information:\n" * sprint(CUDA.versioninfo)
@@ -68,7 +68,7 @@ if try; using CUDA; CUDA.functional(); catch; false; end || args.custom["cuda"]
     end)
 end
 
-if try; using AMDGPU; AMDGPU.functional(); catch; false; end || args.custom["amdgpu"] !== nothing
+if args.custom["amdgpu"] !== nothing
     using AMDGPU
     @assert AMDGPU.functional()
     @info "AMDGPU information:\n" * sprint(AMDGPU.versioninfo)
@@ -82,7 +82,7 @@ if try; using AMDGPU; AMDGPU.functional(); catch; false; end || args.custom["amd
     end)
 end
 
-if try; using Metal; Metal.functional(); catch; false; end || args.custom["metal"] !== nothing
+if args.custom["metal"] !== nothing
     using Metal
     @assert Metal.functional()
     @info "Metal information:\n" * sprint(Metal.versioninfo)
@@ -96,7 +96,7 @@ if try; using Metal; Metal.functional(); catch; false; end || args.custom["metal
     end)
 end
 
-if try; using oneAPI; oneAPI.functional(); catch; false; end || args.custom["oneapi"] !== nothing
+if args.custom["oneapi"] !== nothing
     using oneAPI
     @assert oneAPI.functional()
     @info "oneAPI information:\n" * sprint(oneAPI.versioninfo)
@@ -110,7 +110,7 @@ if try; using oneAPI; oneAPI.functional(); catch; false; end || args.custom["one
     end)
 end
 
-if try; using pocl_jll, OpenCL; !isempty(OpenCL.cl.platforms()); catch; false; end || args.custom["opencl"] !== nothing
+if args.custom["opencl"] !== nothing
     using pocl_jll, OpenCL
     @assert !isempty(OpenCL.cl.platforms())
     @info "OpenCL information:\n" * sprint(OpenCL.versioninfo)

From 5dd3596356fd5ac1aa3613a196edeae54fffa054 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:19:49 -0300
Subject: [PATCH 04/13] Disable automatic Pkg precompilation in CPU CI

---
 .github/workflows/CI-CPU.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/CI-CPU.yml b/.github/workflows/CI-CPU.yml
index e46bc3b7..dbda2e70 100644
--- a/.github/workflows/CI-CPU.yml
+++ b/.github/workflows/CI-CPU.yml
@@ -12,6 +12,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 
+env:
+  JULIA_PKG_PRECOMPILE_AUTO: false
+
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - nthreads ${{ matrix.env.JULIA_NUM_THREADS }} - ${{ github.event_name }}

From 5fc33fa411edcfd5b06339764ee5aec1bb1e115a Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:34:20 -0300
Subject: [PATCH 05/13] Don't test CPU by default when another backend is
 requested.

---
 test/runtests.jl | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 6d5dfe77..6c349a81 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -29,28 +29,6 @@ end
 # Build list of active backends, each with setup code
 backends = Pair{String, Expr}[]
 
-# CPU always active
-using InteractiveUtils
-@info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
-push!(backends, "cpu" => quote
-    global BACKEND = get_backend([])
-    global IS_CPU_BACKEND = true
-    global prefer_threads = true
-    global TEST_DL = Ref{Bool}(false)
-    $_array_from_host_code
-end)
-
-# cpu-ka only when --cpu-ka flag passed
-if args.custom["cpu-ka"] !== nothing
-    push!(backends, "cpu-ka" => quote
-        global BACKEND = get_backend([])
-        global IS_CPU_BACKEND = true
-        global prefer_threads = false
-        global TEST_DL = Ref{Bool}(false)
-        $_array_from_host_code
-    end)
-end
-
 # GPU backends are only tested when explicitly requested via a CLI flag, in which case
 # they are expected to be functional: load or initialization failures propagate.
 
@@ -125,6 +103,30 @@ if args.custom["opencl"] !== nothing
     end)
 end
 
+# cpu-ka only when --cpu-ka flag passed
+if args.custom["cpu-ka"] !== nothing
+    push!(backends, "cpu-ka" => quote
+        global BACKEND = get_backend([])
+        global IS_CPU_BACKEND = true
+        global prefer_threads = false
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
+end
+
+# CPU runs if no backend selected or if explicitly specified
+using InteractiveUtils
+@info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
+if args.custom["cpu"] !== nothing || isempty(backends)
+    push!(backends, "cpu" => quote
+        global BACKEND = get_backend([])
+        global IS_CPU_BACKEND = true
+        global prefer_threads = true
+        global TEST_DL = Ref{Bool}(false)
+        $_array_from_host_code
+    end)
+end
+
 # Duplicate generic tests per active backend
 for (backend_name, setup_code) in backends
     for (test_name, test_body) in generic_tests

From d964ae8fcc55df744d23dacf93e22c5e85b3f6ef Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:38:49 -0300
Subject: [PATCH 06/13] Fix: register the `cpu` flag with parse_args

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 6c349a81..793c1c6f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -13,7 +13,7 @@ const testsuite = find_tests(@__DIR__)
 const generic_tests = find_tests(joinpath(@__DIR__, "generic"))
 
 # Parse args with lowercase hyphenated backend flags
-args = parse_args(ARGS; custom=["cuda", "amdgpu", "metal", "oneapi", "opencl", "cpu-ka"])
+args = parse_args(ARGS; custom=["cuda", "amdgpu", "metal", "oneapi", "opencl", "cpu-ka", "cpu"])
 
 # Common helper code appended to every backend setup
 const _array_from_host_code = quote

From 3214e381536873002fc004f87ec06f0ad14b27ea Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:40:53 -0300
Subject: [PATCH 07/13] Use CUDACore instead of CUDA in the test suite

---
 test/Project.toml | 2 +-
 test/runtests.jl  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/Project.toml b/test/Project.toml
index 5683fde2..d6329fc7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -2,7 +2,7 @@
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
diff --git a/test/runtests.jl b/test/runtests.jl
index 793c1c6f..0db74871 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -33,11 +33,11 @@ backends = Pair{String, Expr}[]
 # they are expected to be functional: load or initialization failures propagate.
 
 if args.custom["cuda"] !== nothing
-    using CUDA
-    @assert CUDA.functional()
-    @info "CUDA information:\n" * sprint(CUDA.versioninfo)
+    using CUDACore
+    @assert CUDACore.functional()
+    @info "CUDACore information:\n" * sprint(CUDACore.versioninfo)
     push!(backends, "cuda" => quote
-        using CUDA
+        using CUDACore
         global BACKEND = CUDABackend()
         global IS_CPU_BACKEND = false
         global prefer_threads = true

From 0fd1ec05e4d379b9a93e7ba2b2e8f0e5b673bd2e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:44:34 -0300
Subject: [PATCH 08/13] Move CPU info

---
 test/runtests.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 0db74871..b6b72600 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,9 @@
 import AcceleratedKernels as AK
 using ParallelTestRunner
 
+using InteractiveUtils
+@info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
+
 const init_code = quote
     import AcceleratedKernels as AK
     using KernelAbstractions
@@ -115,8 +118,6 @@ if args.custom["cpu-ka"] !== nothing
 end
 
 # CPU runs if no backend selected or if explicitly specified
-using InteractiveUtils
-@info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
 if args.custom["cpu"] !== nothing || isempty(backends)
     push!(backends, "cpu" => quote
         global BACKEND = get_backend([])

From 68bb2b7f13789af71c8c8c6d3f10471294501342 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:52:20 -0300
Subject: [PATCH 09/13] Fix: print CUDA version info via CUDATools.versioninfo

---
 test/runtests.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index b6b72600..ecc334eb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -36,9 +36,9 @@ backends = Pair{String, Expr}[]
 # they are expected to be functional: load or initialization failures propagate.
 
 if args.custom["cuda"] !== nothing
-    using CUDACore
+    using CUDACore, CUDATools
     @assert CUDACore.functional()
-    @info "CUDACore information:\n" * sprint(CUDACore.versioninfo)
+    @info "CUDACore information:\n" * sprint(CUDATools.versioninfo)
     push!(backends, "cuda" => quote
         using CUDACore
         global BACKEND = CUDABackend()

From 1650923d3dc482ccc24fbed7daa9d0fe37ac6bff Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:55:54 -0300
Subject: [PATCH 10/13] Fix: add CUDATools to the test project dependencies

---
 test/Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/Project.toml b/test/Project.toml
index d6329fc7..6363aeb0 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -3,6 +3,7 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+CUDATools = "9ec180c6-1c07-47c7-9e6e-ebefa4d1f6d0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

From 6b9849c97dd136f5973a5984a4ce9a4b62232dd1 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:04:03 -0300
Subject: [PATCH 11/13] Install backends as required

---
 test/Project.toml | 7 -------
 test/runtests.jl  | 9 ++++++++-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/Project.toml b/test/Project.toml
index 6363aeb0..76fb12f9 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,19 +1,12 @@
 [deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
-CUDATools = "9ec180c6-1c07-47c7-9e6e-ebefa4d1f6d0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
-OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
-pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
 
 [compat]
 ParallelTestRunner = "2"
diff --git a/test/runtests.jl b/test/runtests.jl
index ecc334eb..7fdf13ac 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,7 +1,8 @@
 import AcceleratedKernels as AK
+using InteractiveUtils
 using ParallelTestRunner
+using Pkg
 
-using InteractiveUtils
 @info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
 
 const init_code = quote
@@ -36,6 +37,8 @@ backends = Pair{String, Expr}[]
 # they are expected to be functional: load or initialization failures propagate.
 
 if args.custom["cuda"] !== nothing
+    Pkg.add("CUDACore")
+    Pkg.add("CUDATools")
     using CUDACore, CUDATools
     @assert CUDACore.functional()
     @info "CUDACore information:\n" * sprint(CUDATools.versioninfo)
@@ -50,6 +53,7 @@ if args.custom["cuda"] !== nothing
 end
 
 if args.custom["amdgpu"] !== nothing
+    Pkg.add("AMDGPU")
     using AMDGPU
     @assert AMDGPU.functional()
     @info "AMDGPU information:\n" * sprint(AMDGPU.versioninfo)
@@ -64,6 +68,7 @@ if args.custom["amdgpu"] !== nothing
 end
 
 if args.custom["metal"] !== nothing
+    Pkg.add("Metal")
     using Metal
     @assert Metal.functional()
     @info "Metal information:\n" * sprint(Metal.versioninfo)
@@ -78,6 +83,7 @@ if args.custom["metal"] !== nothing
 end
 
 if args.custom["oneapi"] !== nothing
+    Pkg.add("oneAPI")
     using oneAPI
     @assert oneAPI.functional()
     @info "oneAPI information:\n" * sprint(oneAPI.versioninfo)
@@ -92,6 +98,7 @@ if args.custom["oneapi"] !== nothing
 end
 
 if args.custom["opencl"] !== nothing
+    Pkg.add(["pocl_jll", "OpenCL"])
     using pocl_jll, OpenCL
     @assert !isempty(OpenCL.cl.platforms())
     @info "OpenCL information:\n" * sprint(OpenCL.versioninfo)

From d5d081c1fa48c1e6409cc2ef1c9c3c930dcc9b85 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:34:25 -0300
Subject: [PATCH 12/13] Fix AMDGPU output

---
 test/runtests.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 7fdf13ac..0c848365 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -56,7 +56,8 @@ if args.custom["amdgpu"] !== nothing
     Pkg.add("AMDGPU")
     using AMDGPU
     @assert AMDGPU.functional()
-    @info "AMDGPU information:\n" * sprint(AMDGPU.versioninfo)
+    println("AMDGPU information:")
+    AMDGPU.versioninfo()
     push!(backends, "amdgpu" => quote
         using AMDGPU
         global BACKEND = ROCBackend()

From 06fcbff7c1de3c300d028792f03d8aa26faf09f0 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 16 Jun 2026 15:36:17 +0200
Subject: [PATCH 13/13] Try going back to hard-coded back-ends.

---
 test/Project.toml | 8 +++++++-
 test/runtests.jl  | 7 -------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/test/Project.toml b/test/Project.toml
index 76fb12f9..4a9cdf68 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,12 +1,18 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+CUDATools = "9ec180c6-1c07-47c7-9e6e-ebefa4d1f6d0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
-Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
 
 [compat]
 ParallelTestRunner = "2"
diff --git a/test/runtests.jl b/test/runtests.jl
index 0c848365..bb52212b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,7 +1,6 @@
 import AcceleratedKernels as AK
 using InteractiveUtils
 using ParallelTestRunner
-using Pkg
 
 @info "Julia information:\n" * sprint(InteractiveUtils.versioninfo)
 
@@ -37,8 +36,6 @@ backends = Pair{String, Expr}[]
 # they are expected to be functional: load or initialization failures propagate.
 
 if args.custom["cuda"] !== nothing
-    Pkg.add("CUDACore")
-    Pkg.add("CUDATools")
     using CUDACore, CUDATools
     @assert CUDACore.functional()
     @info "CUDACore information:\n" * sprint(CUDATools.versioninfo)
@@ -53,7 +50,6 @@ if args.custom["cuda"] !== nothing
 end
 
 if args.custom["amdgpu"] !== nothing
-    Pkg.add("AMDGPU")
     using AMDGPU
     @assert AMDGPU.functional()
     println("AMDGPU information:")
@@ -69,7 +65,6 @@ if args.custom["amdgpu"] !== nothing
 end
 
 if args.custom["metal"] !== nothing
-    Pkg.add("Metal")
     using Metal
     @assert Metal.functional()
     @info "Metal information:\n" * sprint(Metal.versioninfo)
@@ -84,7 +79,6 @@ if args.custom["metal"] !== nothing
 end
 
 if args.custom["oneapi"] !== nothing
-    Pkg.add("oneAPI")
     using oneAPI
     @assert oneAPI.functional()
     @info "oneAPI information:\n" * sprint(oneAPI.versioninfo)
@@ -99,7 +93,6 @@ if args.custom["oneapi"] !== nothing
 end
 
 if args.custom["opencl"] !== nothing
-    Pkg.add(["pocl_jll", "OpenCL"])
     using pocl_jll, OpenCL
     @assert !isempty(OpenCL.cl.platforms())
     @info "OpenCL information:\n" * sprint(OpenCL.versioninfo)