From fba0d325033c8b8e74cf6c15b03266b98ae565d8 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:53:25 -0400 Subject: [PATCH 1/5] Add KernelIntrinsics support --- Project.toml | 2 +- src/OpenCLKernels.jl | 68 +++++++++++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index ae9646f1..f4aa07f7 100644 --- a/Project.toml +++ b/Project.toml @@ -32,7 +32,7 @@ SPIRVIntrinsics = {path = "lib/intrinsics"} Adapt = "4" GPUArrays = "11.2.1" GPUCompiler = "1.7.1" -KernelAbstractions = "0.9.38" +KernelAbstractions = "0.10" LLVM = "9.1" LinearAlgebra = "1" OpenCL_jll = "=2024.10.24" diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl index e06102cf..626b7fa0 100644 --- a/src/OpenCLKernels.jl +++ b/src/OpenCLKernels.jl @@ -1,9 +1,10 @@ module OpenCLKernels using ..OpenCL -using ..OpenCL: @device_override, method_table +using ..OpenCL: @device_override, method_table, kernel_convert, clfunction import KernelAbstractions as KA +import KernelAbstractions.KernelIntrinsics as KI import StaticArrays @@ -126,33 +127,62 @@ function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize return nothing end +KI.argconvert(::OpenCLBackend, arg) = kernel_convert(arg) + +function KI.kernel_function(::OpenCLBackend, f::F, tt::TT=Tuple{}; name = nothing, kwargs...) where {F,TT} + kern = clfunction(f, tt; name, kwargs...) + KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern) +end + +function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsize = 1) + KI.check_launch_args(numworkgroups, workgroupsize) + + local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...) + + numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...) + global_size = local_size .* numworkgroups + + obj.kern(args...; local_size, global_size) + return nothing +end + + +function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int=typemax(Int))::Int + wginfo = cl.work_group_info(kernel.kern.fun, cl.device()) + Int(min(wginfo.size, max_work_items)) +end +function KI.max_work_group_size(::OpenCLBackend)::Int + Int(cl.device().max_work_group_size) +end +function KI.multiprocessor_count(::OpenCLBackend)::Int + Int(cl.device().max_compute_units) +end ## Indexing Functions +## COV_EXCL_START -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end -@device_override @inline function KA.__index_Global_Linear(ctx) - #return get_global_id(1) # JuliaGPU/OpenCL.jl#346 - I = KA.__index_Global_Cartesian(ctx) - @inbounds LinearIndices(KA.__ndrange(ctx))[I] +@device_override @inline function KI.get_global_id() + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] +@device_override @inline function KI.get_local_size() + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] +@device_override @inline function KI.get_num_groups() + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_size() + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) @@ -167,7 +197,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ptr = OpenCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end @@ -179,14 +209,14 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE) end -@device_override @inline function KA.__print(args...) +@device_override @inline function KI._print(args...) OpenCL._print(args...) end - +## COV_EXCL_STOP ## Other From 6a8117fc417e918e2a588ef6a7a847e0d23447a5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 17 Nov 2025 22:02:15 -0400 Subject: [PATCH 2/5] Temp CI --- Project.toml | 3 ++- test/Project.toml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f4aa07f7..7ae28129 100644 --- a/Project.toml +++ b/Project.toml @@ -27,12 +27,13 @@ spirv2clc_jll = "f0274c0c-8c8a-59f1-85b7-f7d60330c5fb" [sources] SPIRVIntrinsics = {path = "lib/intrinsics"} +KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"} [compat] Adapt = "4" GPUArrays = "11.2.1" GPUCompiler = "1.7.1" -KernelAbstractions = "0.10" +KernelAbstractions = "0.9, 0.10" LLVM = "9.1" LinearAlgebra = "1" OpenCL_jll = "=2024.10.24" diff --git a/test/Project.toml b/test/Project.toml index 64eacfcf..cb13f2dc 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -10,6 +10,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" From 9976a11b54c5d2255b9a8c82135ef14ea23d6007 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Jul 2026 09:10:55 -0300 Subject: [PATCH 3/5] Rename --- src/OpenCLKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl index 626b7fa0..98d8f372 100644 --- a/src/OpenCLKernels.jl +++ b/src/OpenCLKernels.jl @@ -4,7 +4,7 @@ using ..OpenCL using ..OpenCL: @device_override, method_table, kernel_convert, clfunction import KernelAbstractions as KA -import KernelAbstractions.KernelIntrinsics as KI +import KernelAbstractions.KernelInterface as KI import StaticArrays From 3446d9f6580140d1bdc2d59ab8543a25836ac591 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Jul 2026 11:00:29 -0300 Subject: [PATCH 4/5] KA.versioninfo() --- src/OpenCLKernels.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl index 98d8f372..3b84e180 100644 --- a/src/OpenCLKernels.jl +++ b/src/OpenCLKernels.jl @@ -18,6 +18,8 @@ export OpenCLBackend struct OpenCLBackend <: KA.GPU end +KA.versioninfo(io::IO, ::OpenCLBackend) = OpenCL.versioninfo(io) + function KA.allocate(::OpenCLBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T if unified memory_backend = cl.unified_memory_backend() From 1f4d27591439fa90b678af5c1db9c08da54bba9f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 4 Jul 2026 16:18:12 -0300 Subject: [PATCH 5/5] Skip CPU-only tests --- test/kernelabstractions.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl index 97e54154..e53b79c8 100644 --- a/test/kernelabstractions.jl +++ b/test/kernelabstractions.jl @@ -15,5 +15,7 @@ end skip_tests=Set([ "sparse", "Convert", # Need to opt out of i128 + "CPU synchronization", + "fallback test: callable types" ]) KATestSuite.testsuite(OpenCLBackend, "OpenCL", OpenCL, CLArray, CLDeviceArray; skip_tests)