From f56959620f034ca1075b527cc6fa77c214fe7b4f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 15:20:03 +0200
Subject: [PATCH 01/15] Add dense GPU sparse conversion

---
 lib/JLArrays/src/JLArrays.jl | 15 +++++++++++
 src/host/sparse.jl           | 52 ++++++++++++++++++++++++++++++++++++
 test/testsuite/sparse.jl     | 13 +++++++++
 3 files changed, 80 insertions(+)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 0efc5a5e7..b55f13d96 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -224,10 +224,14 @@ GPUArrays.dense_vector_type(a::JLArray{T, N}) where {T, N} = JLArray{T, 1}
 GPUArrays.dense_vector_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, 1}
 
 GPUArrays.sparse_array_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSC
+GPUArrays.sparse_array_type(sa::JLArray{<:Any,2}) = JLSparseMatrixCSC
+GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,2}}) = JLSparseMatrixCSC
 GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSC
 GPUArrays.sparse_array_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSR
 GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSR
 GPUArrays.sparse_array_type(sa::JLSparseVector) = JLSparseVector
+GPUArrays.sparse_array_type(sa::JLArray{<:Any,1}) = JLSparseVector
+GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,1}}) = JLSparseVector
 GPUArrays.sparse_array_type(::Type{<:JLSparseVector}) = JLSparseVector
 
 GPUArrays.dense_array_type(sa::JLSparseVector) = JLArray
@@ -238,7 +242,9 @@ GPUArrays.dense_array_type(sa::JLSparseMatrixCSR) = JLArray
 GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSR}) = JLArray
 
 GPUArrays.csc_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSC
+GPUArrays.csc_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSC
 GPUArrays.csr_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSR
+GPUArrays.csr_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSR
 
 Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat))
@@ -383,6 +389,7 @@ function JLSparseMatrixCSC(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSC{Tv, Ti}(colPtr, rowVal, nzVal, (xs.m, xs.n))
 end
 JLSparseMatrixCSC(xs::SparseVector) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
+JLSparseMatrixCSC(xs::JLArray{<:Any,2}) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
 Base.length(x::JLSparseMatrixCSC) = prod(x.dims)
 Base.size(x::JLSparseMatrixCSC) = x.dims
 
@@ -397,6 +404,7 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, (xs.m, xs.n))
 end
 JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs))
+JLSparseMatrixCSR(xs::JLArray{<:Any,2}) = JLSparseMatrixCSR(SparseMatrixCSC(xs))
 function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR(SparseMatrixCSC(xs))
 end
@@ -456,6 +464,13 @@ Adapt.adapt_storage(::Type{Array}, xs::JLArray) = convert(Array, xs)
 Base.convert(::Type{T}, x::T) where T <: JLArray = x
 
 
+## indexing
+
+Base.findall(bools::JLArray{Bool}) = JLArray(findall(Array(bools)))
+Base.findall(f::Function, A::JLArray) = JLArray(findall(f, Array(A)))
+Base.findall(f::Base.Fix2{typeof(in)}, A::JLArray) = JLArray(findall(f, Array(A)))
+
+
 ## broadcast
 
 import Base.Broadcast: BroadcastStyle, Broadcasted
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 5c84ee77c..60d469af6 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -44,6 +44,58 @@ Base.iszero(A::AbstractGPUSparseArray) = SparseArrays.nnz(A) == 0 || all(iszero,
 SparseArrays.SparseVector(x::AbstractGPUSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x)))
 SparseArrays.SparseMatrixCSC(x::AbstractGPUSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(SparseArrays.getcolptr(x)), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x)))
 
+@kernel function dense_sparse_values_kernel!(values, A, indices)
+    i = @index(Global, Linear)
+    if i <= length(indices)
+        @inbounds values[i] = A[indices[i]]
+    end
+end
+
+function dense_sparse_values(A::AnyGPUArray{Tv}, indices) where {Tv}
+    values = similar(A, Tv, length(indices))
+    if !isempty(values)
+        dense_sparse_values_kernel!(get_backend(A))(values, A, indices; ndrange=length(indices))
+    end
+    return values
+end
+
+function SparseArrays.SparseVector(x::AnyGPUVector)
+    inds = findall(!iszero, x)
+    vals = dense_sparse_values(x, inds)
+    host_inds = Array(inds)
+    host_vals = Array(vals)
+    unsafe_free!(inds)
+    unsafe_free!(vals)
+    return SparseVector(length(x), host_inds, host_vals)
+end
+
+function SparseArrays.SparseMatrixCSC(x::AnyGPUMatrix)
+    inds = findall(!iszero, x)
+    vals = dense_sparse_values(x, inds)
+    host_inds = Array(inds)
+    host_vals = Array(vals)
+    unsafe_free!(inds)
+    unsafe_free!(vals)
+
+    rows = getindex.(host_inds, 1)
+    cols = getindex.(host_inds, 2)
+    return sparse(rows, cols, host_vals, size(x)...)
+end
+
+function SparseArrays.sparse(A::AnyGPUVector)
+    sparse_array_type(A)(SparseVector(A))
+end
+
+function SparseArrays.sparse(A::AnyGPUMatrix; fmt=:csc)
+    if fmt == :csc
+        return sparse_array_type(A)(SparseMatrixCSC(A))
+    elseif fmt == :csr
+        return csr_type(sparse_array_type(A))(SparseMatrixCSC(A))
+    else
+        throw(ArgumentError("format :$fmt not available, use :csc or :csr"))
+    end
+end
+
 # similar
 Base.similar(Vec::V) where {V<:AbstractGPUSparseVector} = V(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec)), length(Vec))
 Base.similar(Mat::M) where {M<:AbstractGPUSparseMatrixCSC} = M(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat)), size(Mat))
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 3e0d92fe3..763e244ca 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -71,6 +71,11 @@ function vector_construction(AT, eltypes)
         @test collect(d_x) == collect(x)
         @test similar(d_x) isa AT{ET}
         @test similar(d_x, Float32) isa AT{Float32}
+
+        dense_x = dense_AT(collect(x))
+        @test SparseVector(dense_x) == x
+        @test sparse(dense_x) isa AT{ET}
+        @test SparseVector(sparse(dense_x)) == x
     end
 end
 
@@ -148,6 +153,14 @@ function matrix_construction(AT, eltypes)
         @test similar(d_x, Float32, (n, m)) isa AT{Float32}
         @test similar(d_x, (3, 4)) isa AT{ET}
         @test size(similar(d_x, (3, 4))) == (3, 4)
+
+        dense_x = dense_AT(collect(x))
+        @test SparseMatrixCSC(dense_x) == x
+        @test sparse(dense_x) isa GPUArrays.sparse_array_type(dense_x){ET}
+        @test SparseMatrixCSC(sparse(dense_x)) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            @test SparseMatrixCSC(sparse(dense_x; fmt=:csr)) == x
+        end
     end
 end
 

From 6563c12243ceb8b076d70535e6ecf2e18449a815 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 16:04:49 +0200
Subject: [PATCH 02/15] Add generic COO sparse matmul

---
 Project.toml                 |   2 +
 lib/JLArrays/src/JLArrays.jl |  81 +++++++++-
 src/GPUArrays.jl             |   1 +
 src/host/sparse.jl           | 306 +++++++++++++++++++++++++++++++++++
 test/runtests.jl             |   1 +
 test/testsuite.jl            |   1 +
 test/testsuite/sparse.jl     |  83 ++++++++++
 7 files changed, 473 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 707ee08d5..d0e72995a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ projects = ["lib/GPUArraysCore", "lib/JLArrays", "test", "docs"]
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
@@ -27,6 +28,7 @@ JLD2Ext = "JLD2"
 
 [compat]
 Adapt = "4.6.1"
+Atomix = "1.1"
 GPUArraysCore = "= 0.2.0"
 JLD2 = "0.4, 0.5, 0.6"
 KernelAbstractions = "0.9.28, 0.10"
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index b55f13d96..1c9123e9d 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -6,7 +6,8 @@
 
 module JLArrays
 
-export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR
+export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector,
+       JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO
 
 using GPUArrays
 
@@ -74,6 +75,11 @@ Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[])
 
     @inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(typed_data(A), index)
     @inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(typed_data(A), x, index)
+    Base.pointer(x::JLDeviceArray{T}) where {T} =
+        convert(Ptr{T}, pointer(x.data)) + x.offset
+    @inline function Base.pointer(x::JLDeviceArray{T}, i::Integer) where T
+        pointer(x) + (i - 1) * sizeof(T)
+    end
 end
 
 #
@@ -188,6 +194,46 @@ end
 
 JLSparseMatrixCSC(Mat::Union{Transpose{Tv, <:SparseMatrixCSC}, Adjoint{Tv, <:SparseMatrixCSC}}) where {Tv} = JLSparseMatrixCSC(JLSparseMatrixCSR(Mat))
 
+mutable struct JLSparseMatrixCOO{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCOO{Tv, Ti}
+    rowInd::JLArray{Ti, 1}
+    colInd::JLArray{Ti, 1}
+    nzVal::JLArray{Tv, 1}
+    dims::NTuple{2,Int}
+    nnz::Ti
+
+    function JLSparseMatrixCOO{Tv, Ti}(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1},
+                                       nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer},
+                                       nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer}
+        length(rowInd) == length(colInd) == length(nzVal) ||
+            throw(DimensionMismatch("COO row, column, and value arrays have different lengths"))
+        new{Tv, Ti}(rowInd, colInd, nzVal, (Int(dims[1]), Int(dims[2])), Ti(nnz))
+    end
+end
+JLSparseMatrixCOO(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1},
+                  nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer},
+                  nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer} =
+    JLSparseMatrixCOO{Tv, Ti}(rowInd, colInd, nzVal, dims, nnz)
+
+Base.length(x::JLSparseMatrixCOO) = prod(x.dims)
+Base.size(x::JLSparseMatrixCOO) = x.dims
+SparseArrays.nnz(x::JLSparseMatrixCOO) = x.nnz
+SparseArrays.nonzeros(x::JLSparseMatrixCOO) = x.nzVal
+
+function SparseArrays.SparseMatrixCSC(x::JLSparseMatrixCOO)
+    return sparse(Array(x.rowInd), Array(x.colInd), Array(x.nzVal), size(x)...)
+end
+
+function Base.getindex(A::JLSparseMatrixCOO{Tv}, i::Integer, j::Integer) where {Tv}
+    @boundscheck checkbounds(A, i, j)
+    acc = zero(Tv)
+    for k in 1:Int(nnz(A))
+        @inbounds if A.rowInd[k] == i && A.colInd[k] == j
+            acc += A.nzVal[k]
+        end
+    end
+    return acc
+end
+
 function Base.size(g::JLSparseMatrixCSR, d::Integer)
     if 1 <= d <= 2
         return g.dims[d]
@@ -229,6 +275,8 @@ GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,2}}) = JLSparseMatrixCSC
 GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSC
 GPUArrays.sparse_array_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSR
 GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSR
+GPUArrays.sparse_array_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCOO
+GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCOO
 GPUArrays.sparse_array_type(sa::JLSparseVector) = JLSparseVector
 GPUArrays.sparse_array_type(sa::JLArray{<:Any,1}) = JLSparseVector
 GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,1}}) = JLSparseVector
@@ -240,14 +288,24 @@ GPUArrays.dense_array_type(sa::JLSparseMatrixCSC) = JLArray
 GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSC}) = JLArray
 GPUArrays.dense_array_type(sa::JLSparseMatrixCSR) = JLArray
 GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSR}) = JLArray
+GPUArrays.dense_array_type(sa::JLSparseMatrixCOO) = JLArray
+GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCOO}) = JLArray
 
 GPUArrays.csc_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSC
+GPUArrays.csc_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCSC
 GPUArrays.csc_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSC
+GPUArrays.csc_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCSC
 GPUArrays.csr_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSR
+GPUArrays.csr_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCSR
 GPUArrays.csr_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSR
+GPUArrays.csr_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCSR
+GPUArrays.coo_type(sa::Union{JLSparseMatrixCSC,JLSparseMatrixCSR,JLSparseMatrixCOO}) = JLSparseMatrixCOO
+GPUArrays.coo_type(::Type{<:Union{JLSparseMatrixCSC,JLSparseMatrixCSR,JLSparseMatrixCOO}}) = JLSparseMatrixCOO
 
 Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat))
+Base.similar(Mat::JLSparseMatrixCOO) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat)), size(Mat), nnz(Mat))
+Base.similar(Mat::JLSparseMatrixCOO, T::Type) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat), T), size(Mat), nnz(Mat))
 
 Base.similar(Mat::JLSparseMatrixCSC, T::Type, N::Int, M::Int) =  JLSparseMatrixCSC(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type, N::Int, M::Int) =  JLSparseMatrixCSR(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
@@ -264,6 +322,7 @@ Base.similar(Mat::JLSparseMatrixCSR, dims::Tuple{Int, Int}) = similar(Mat, dims.
 JLArray(x::JLSparseVector)    = JLArray(collect(SparseVector(x)))
 JLArray(x::JLSparseMatrixCSC) = JLArray(collect(SparseMatrixCSC(x)))
 JLArray(x::JLSparseMatrixCSR) = JLArray(collect(SparseMatrixCSC(x)))
+JLArray(x::JLSparseMatrixCOO) = JLArray(collect(SparseMatrixCSC(x)))
 
 # conversion of untyped data to a typed Array
 function typed_data(x::JLArray{T}) where {T}
@@ -405,12 +464,28 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
 end
 JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs))
 JLSparseMatrixCSR(xs::JLArray{<:Any,2}) = JLSparseMatrixCSR(SparseMatrixCSC(xs))
+function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
+    rowInd, colInd, nzVal = findnz(xs)
+    return JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(rowInd), JLVector{Ti}(colInd),
+                                     JLVector{Tv}(nzVal), (xs.m, xs.n))
+end
+JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
+JLSparseMatrixCOO(xs::JLArray{<:Any,2}) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
 function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR(SparseMatrixCSC(xs))
 end
+function JLSparseMatrixCOO(xs::Union{JLSparseMatrixCSC{Tv, Ti},JLSparseMatrixCSR{Tv, Ti}}) where {Tv, Ti}
+    return JLSparseMatrixCOO(SparseMatrixCSC(xs))
+end
 function JLSparseMatrixCSC(xs::JLSparseMatrixCSR{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSC(SparseMatrixCSC(xs))
 end
+function JLSparseMatrixCSC(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
+    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
+end
+function JLSparseMatrixCSR(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
+    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
+end
 function Base.copyto!(dst::JLSparseMatrixCSR, src::JLSparseMatrixCSR)
     if size(dst) != size(src)
         throw(ArgumentError("Inconsistent Sparse Matrix size"))
@@ -583,13 +658,15 @@ Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSC{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceMatrixCSC{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), x.dims, x.nnz)
 Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSR{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceMatrixCSR{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), x.dims, x.nnz)
+Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCOO{Tv,Ti}) where {Tv,Ti} =
+GPUSparseDeviceMatrixCOO{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), x.dims, x.nnz)
 Adapt.adapt_structure(to::Adaptor, x::JLSparseVector{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceVector{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.iPtr), adapt(to, x.nzVal), x.len, x.nnz)
 
 ## KernelAbstractions interface
 
 KernelAbstractions.get_backend(a::JLA) where JLA <: JLArray = JLBackend()
-KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseVector} = JLBackend()
+KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO, JLSparseVector} = JLBackend()
 
 function KernelAbstractions.mkcontext(kernel::Kernel{JLBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
     return KernelAbstractions.CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 17a4d8975..d50dcee0e 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -9,6 +9,7 @@ using Printf
 using LinearAlgebra.BLAS
 using Base.Cartesian
 
+using Atomix
 using Adapt
 using LLVM.Interop
 
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 60d469af6..266e659ce 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -119,6 +119,312 @@ coo_type(sa) = coo_type(typeof(sa))
 function _spadjoint end
 function _sptranspose end
 
+const SparseMatmulEltypes = Union{Float16,Float32,ComplexF16,ComplexF32}
+
+sparse_matmul_accumulator(::Type{T}) where {T} = T
+sparse_matmul_accumulator(::Type{Float16}) = Float32
+sparse_matmul_accumulator(::Type{ComplexF16}) = ComplexF32
+
+const GPUSparseMatrixOrWrapper = Union{
+    AbstractGPUSparseMatrix,
+    Transpose{<:Any,<:AbstractGPUSparseMatrix},
+    Adjoint{<:Any,<:AbstractGPUSparseMatrix},
+}
+
+function sparse_op(t::AbstractChar)
+    t in ('N', 'n', 'S', 's', 'H', 'h') && return false, false
+    t in ('T', 't') && return true, false
+    t in ('C', 'c') && return true, true
+    throw(ArgumentError("unsupported sparse matrix operation '$t'"))
+end
+
+sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A)
+
+sparse_coo(A::AbstractGPUSparseMatrixCOO) = A
+sparse_coo(A::AbstractGPUSparseMatrix) = coo_type(A)(A)
+
+@inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x)
+    @inbounds Atomix.@atomic A[Int(I)] += x
+    return
+end
+
+@inline function sparse_atomic_add!(A::AbstractArray{<:Complex}, parts, I::Integer, x)
+    j = 2 * Int(I)
+    @inbounds begin
+        Atomix.@atomic parts[j - 1] += real(x)
+        Atomix.@atomic parts[j] += imag(x)
+    end
+    return
+end
+
+sparse_atomic_parts(A::AbstractArray{<:Real}) = A
+sparse_atomic_parts(A::AbstractArray{<:Complex{T}}) where {T} = reinterpret(T, A)
+
+@kernel function sparse_scale_kernel!(Y, beta)
+    i = @index(Global, Linear)
+    if i <= length(Y)
+        T = eltype(Y)
+        @inbounds Y[i] = iszero(beta) ? zero(T) : T(beta) * Y[i]
+    end
+end
+
+@kernel function sparse_combine_kernel!(Y, acc, alpha, beta, ::Val{Tacc}) where {Tacc}
+    i = @index(Global, Linear)
+    if i <= length(Y)
+        Ty = eltype(Y)
+        @inbounds Y[i] = iszero(beta) ?
+            Ty(Tacc(alpha) * Tacc(acc[i])) :
+            Ty(Tacc(alpha) * Tacc(acc[i]) + Tacc(beta) * Tacc(Y[i]))
+    end
+end
+
+function sparse_scale!(Y, beta)
+    isempty(Y) && return Y
+    sparse_scale_kernel!(get_backend(Y))(Y, beta; ndrange=length(Y))
+    return Y
+end
+
+function sparse_combine!(Y::AnyGPUArray{Ty}, acc, alpha, beta) where {Ty}
+    isempty(Y) && return Y
+    Tacc = sparse_matmul_accumulator(Ty)
+    sparse_combine_kernel!(get_backend(Y))(Y, acc, Tacc(alpha), Tacc(beta), Val(Tacc);
+                                           ndrange=length(Y))
+    return Y
+end
+
+@kernel function spmv_coo_kernel!(y, yparts, alpha, rowInd, colInd, nzVal, x,
+                                  trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds begin
+            out = trans ? colInd[k] : rowInd[k]
+            src = trans ? rowInd[k] : colInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            sparse_atomic_add!(y, yparts, out, Tacc(alpha) * v * Tacc(x[src]))
+        end
+    end
+end
+
+@kernel function spmm_coo_kernel!(C, Cparts, alpha, rowInd, colInd, nzVal, B,
+                                  trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    k, j = @index(Global, NTuple)
+    if k <= length(nzVal) && j <= size(B, 2)
+        @inbounds begin
+            outrow = trans ? colInd[k] : rowInd[k]
+            srcrow = trans ? rowInd[k] : colInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            lin = Int(outrow) + (Int(j) - 1) * size(C, 1)
+            sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * v * Tacc(B[srcrow, j]))
+        end
+    end
+end
+
+@kernel function dense_spmm_coo_kernel!(C, Cparts, alpha, A, rowInd, colInd, nzVal,
+                                        trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    i, k = @index(Global, NTuple)
+    if i <= size(C, 1) && k <= length(nzVal)
+        @inbounds begin
+            outcol = trans ? rowInd[k] : colInd[k]
+            srccol = trans ? colInd[k] : rowInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            lin = Int(i) + (Int(outcol) - 1) * size(C, 1)
+            sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * Tacc(A[i, srccol]) * v)
+        end
+    end
+end
+
+function spmv_coo!(y::AnyGPUVector{Ty}, A::AbstractGPUSparseMatrixCOO, x,
+                   alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Ty}
+    Tacc = sparse_matmul_accumulator(Ty)
+    if Tacc === Ty
+        sparse_scale!(y, beta)
+        target = y
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(y, Tacc, size(y))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(A) > 0
+        parts = sparse_atomic_parts(target)
+        spmv_coo_kernel!(get_backend(y))(target, parts, scatter_alpha, A.rowInd, A.colInd,
+                                         nonzeros(A), x, trans, conjval, Val(Tacc);
+                                         ndrange=nnz(A))
+    end
+
+    if !(Tacc === Ty)
+        sparse_combine!(y, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return y
+end
+
+function spmm_coo!(C::AnyGPUMatrix{Tc}, A::AbstractGPUSparseMatrixCOO, B,
+                   alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc}
+    Tacc = sparse_matmul_accumulator(Tc)
+    if Tacc === Tc
+        sparse_scale!(C, beta)
+        target = C
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(C, Tacc, size(C))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(A) > 0 && size(C, 2) > 0
+        parts = sparse_atomic_parts(target)
+        spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A.rowInd, A.colInd,
+                                         nonzeros(A), B, trans, conjval, Val(Tacc);
+                                         ndrange=(nnz(A), size(C, 2)))
+    end
+
+    if !(Tacc === Tc)
+        sparse_combine!(C, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return C
+end
+
+function dense_spmm_coo!(C::AnyGPUMatrix{Tc}, A, B::AbstractGPUSparseMatrixCOO,
+                         alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc}
+    Tacc = sparse_matmul_accumulator(Tc)
+    if Tacc === Tc
+        sparse_scale!(C, beta)
+        target = C
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(C, Tacc, size(C))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(B) > 0 && size(C, 1) > 0
+        parts = sparse_atomic_parts(target)
+        dense_spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A, B.rowInd,
+                                               B.colInd, nonzeros(B), trans, conjval,
+                                               Val(Tacc);
+                                               ndrange=(size(C, 1), nnz(B)))
+    end
+
+    if !(Tacc === Tc)
+        sparse_combine!(C, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return C
+end
+
+function Base.:(*)(A::GPUSparseMatrixOrWrapper, x::AnyGPUVector)
+    size(A, 2) == length(x) || throw(DimensionMismatch())
+    y = similar(x, promote_type(eltype(A), eltype(x)), size(A, 1))
+    return LinearAlgebra.mul!(y, A, x)
+end
+
+function Base.:(*)(A::GPUSparseMatrixOrWrapper, B::AnyGPUMatrix)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    C = similar(B, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2)))
+    return LinearAlgebra.mul!(C, A, B)
+end
+
+function Base.:(*)(A::AnyGPUMatrix, B::GPUSparseMatrixOrWrapper)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    C = similar(A, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2)))
+    return LinearAlgebra.mul!(C, A, B)
+end
+
+LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar,
+                                 A::AbstractGPUSparseMatrix{Ta}, x::AnyGPUVector{Tx},
+                                 muladd::MulAddMul) where {
+                                     Ty<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tx<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matvecmul!(y, tA, A, x, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar,
+                                          A::AbstractGPUSparseMatrix{Ta},
+                                          x::AnyGPUVector{Tx},
+                                          alpha::Number, beta::Number) where {
+                                              Ty<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tx<:SparseMatmulEltypes,
+                                          }
+    trans, conjval = sparse_op(tA)
+    Asize = sparse_op_size(A, trans)
+    Asize[2] == length(x) ||
+        throw(DimensionMismatch("A has dimensions $Asize but x has length $(length(x))"))
+    Asize[1] == length(y) ||
+        throw(DimensionMismatch("A has dimensions $Asize but y has length $(length(y))"))
+    y === x && throw(ArgumentError("output vector must not be aliased with input vector"))
+    isempty(y) && return y
+
+    return spmv_coo!(y, sparse_coo(A), x, alpha, beta, trans, conjval)
+end
+
+LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                 A::AbstractGPUSparseMatrix{Ta}, B::AnyGPUMatrix{Tb},
+                                 muladd::MulAddMul) where {
+                                     Tc<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tb<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                          A::AbstractGPUSparseMatrix{Ta},
+                                          B::AnyGPUMatrix{Tb},
+                                          alpha::Number, beta::Number) where {
+                                              Tc<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tb<:SparseMatmulEltypes,
+                                          }
+    trans, conjval = sparse_op(tA)
+    Bop = LinearAlgebra.wrap(B, tB)
+    Asize = sparse_op_size(A, trans)
+    Asize[2] == size(Bop, 1) ||
+        throw(DimensionMismatch("A has dimensions $Asize but B has dimensions $(size(Bop))"))
+    size(C) == (Asize[1], size(Bop, 2)) ||
+        throw(DimensionMismatch("C has dimensions $(size(C)), should have $((Asize[1], size(Bop, 2)))"))
+    C === B && throw(ArgumentError("output matrix must not be aliased with input matrix"))
+    isempty(C) && return C
+
+    return spmm_coo!(C, sparse_coo(A), Bop, alpha, beta, trans, conjval)
+end
+
+LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                 A::AnyGPUMatrix{Ta}, B::AbstractGPUSparseMatrix{Tb},
+                                 muladd::MulAddMul) where {
+                                     Tc<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tb<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                          A::AnyGPUMatrix{Ta},
+                                          B::AbstractGPUSparseMatrix{Tb},
+                                          alpha::Number, beta::Number) where {
+                                              Tc<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tb<:SparseMatmulEltypes,
+                                          }
+    Aop = LinearAlgebra.wrap(A, tA)
+    trans, conjval = sparse_op(tB)
+    Bsize = sparse_op_size(B, trans)
+    size(Aop, 2) == Bsize[1] ||
+        throw(DimensionMismatch("A has dimensions $(size(Aop)) but B has dimensions $Bsize"))
+    size(C) == (size(Aop, 1), Bsize[2]) ||
+        throw(DimensionMismatch("C has dimensions $(size(C)), should have $((size(Aop, 1), Bsize[2]))"))
+    C === A && throw(ArgumentError("output matrix must not be aliased with input matrix"))
+    isempty(C) && return C
+
+    return dense_spmm_coo!(C, Aop, sparse_coo(B), alpha, beta, trans, conjval)
+end
+
 function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2)
     if p == Inf
         return maximum(sum(abs, A; dims=2))
diff --git a/test/runtests.jl b/test/runtests.jl
index a91715b9a..5d13993ad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,6 +9,7 @@ const init_worker_code = quote
     include("testsuite.jl")
 
     TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR)
+    TestSuite.sparse_coo_type(::Type{<:JLArray}) = JLSparseMatrixCOO
     TestSuite.sparse_types(::Type{<:Array})   = (SparseVector, SparseMatrixCSC)
 
     # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved
diff --git a/test/testsuite.jl b/test/testsuite.jl
index e4aea4f72..f95c7366e 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -76,6 +76,7 @@ supported_eltypes() = (Int16, Int32, Int64,
 # derived sparse types that are supported by the array type
 
 sparse_types(::Type{AT}) where {AT} = ()
+sparse_coo_type(::Type{AT}) where {AT} = nothing
 
 # some convenience predicates for filtering test eltypes
 isrealtype(T) = T <: Real
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 763e244ca..e120e450c 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -15,11 +15,94 @@
             iszero_matrix(sparse_AT, eltypes)
        end
     end
+    coo_AT = sparse_coo_type(AT)
+    coo_AT === nothing || coo_matmul(coo_AT, AT, eltypes)
 end
 
 using SparseArrays
 using SparseArrays: nonzeroinds, nonzeros, rowvals
 
+function coo_values(::Type{T}) where {T<:Real}
+    return T[1, 2, 3, 4]
+end
+
+function coo_values(::Type{T}) where {T<:Complex}
+    return T[1 + im, 2 - 3im, 3 + 2im, 4 - im]
+end
+
+function coo_vector(::Type{T}) where {T<:Real}
+    return T[2, 3, 5]
+end
+
+function coo_vector(::Type{T}) where {T<:Complex}
+    return T[2 - im, 3 + 4im, 5 - 2im]
+end
+
+function coo_rhs(::Type{T}) where {T<:Real}
+    return T[
+        1 2;
+        3 4;
+        5 6
+    ]
+end
+
+function coo_rhs(::Type{T}) where {T<:Complex}
+    return T[
+        1 + im 2 - im;
+        3 + 2im 4 - 3im;
+        5 - im 6 + im
+    ]
+end
+
+function coo_lhs(::Type{T}) where {T<:Real}
+    return T[
+        1 2 3;
+        4 5 6
+    ]
+end
+
+function coo_lhs(::Type{T}) where {T<:Complex}
+    return T[
+        1 - im 2 + im 3 - 2im;
+        4 + 3im 5 - im 6 + im
+    ]
+end
+
+function coo_matmul(AT, dense_AT, eltypes)
+    for ET in (Float16, Float32, ComplexF16, ComplexF32)
+        ET in eltypes || continue
+        @testset "COO repeated-index matmul($ET)" begin
+            I = Int32[1, 1, 2, 3]
+            J = Int32[1, 1, 2, 1]
+            V = coo_values(ET)
+            A = sparse(Int.(I), Int.(J), V, 3, 3)
+            dA = AT(dense_AT(I), dense_AT(J), dense_AT(V), (3, 3), length(V))
+
+            x = coo_vector(ET)
+            dx = dense_AT(x)
+            @test Array(dA * dx) ≈ Array(A * x)
+            @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x)
+            @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x)
+
+            old_y = dense_AT(fill(ET(1), 3))
+            mul!(old_y, dA, dx, ET(2), ET(3))
+            @test Array(old_y) ≈ Array(ET(2) .* (A * x) .+ ET(3))
+
+            B = coo_rhs(ET)
+            dB = dense_AT(B)
+            @test Array(dA * dB) ≈ Array(A * B)
+            @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B)
+            @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B)
+
+            L = coo_lhs(ET)
+            dL = dense_AT(L)
+            @test Array(dL * dA) ≈ Array(L * A)
+            @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A))
+            @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A))
+        end
+    end
+end
+
 function vector(AT, eltypes)
     dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes

From 13ae5b99de9f9748a046af30acf9ac14568e6c7c Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 19:34:14 +0200
Subject: [PATCH 03/15] Add typed dense sparse conversion

---
 lib/JLArrays/src/JLArrays.jl | 38 ++++++++++++++--
 src/host/sparse.jl           | 86 ++++++++++++++++++++----------------
 test/testsuite/sparse.jl     | 10 +++++
 3 files changed, 94 insertions(+), 40 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 1c9123e9d..9f33a29bc 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -435,6 +435,9 @@ function JLSparseVector(xs::SparseVector{Tv, Ti}) where {Ti, Tv}
     copyto!(nzVal, convert(Vector{Tv}, xs.nzval))
     return JLSparseVector{Tv, Ti}(iPtr, nzVal, length(xs),)
 end
+SparseArrays.SparseVector(xs::JLArray{<:Any,1}) = SparseVector(JLSparseVector(xs))
+JLSparseVector(xs::JLArray{T,1}) where {T} =
+    GPUArrays.sparse_from_dense(JLSparseVector{T,Int}, xs)
 Base.length(x::JLSparseVector) = x.len
 Base.size(x::JLSparseVector) = (x.len,)
 
@@ -447,8 +450,10 @@ function JLSparseMatrixCSC(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     copyto!(nzVal,  convert(Vector{Tv}, xs.nzval))
     return JLSparseMatrixCSC{Tv, Ti}(colPtr, rowVal, nzVal, (xs.m, xs.n))
 end
+SparseArrays.SparseMatrixCSC(xs::JLArray{<:Any,2}) = SparseMatrixCSC(JLSparseMatrixCSC(xs))
 JLSparseMatrixCSC(xs::SparseVector) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
-JLSparseMatrixCSC(xs::JLArray{<:Any,2}) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
+JLSparseMatrixCSC(xs::JLArray{T,2}) where {T} =
+    GPUArrays.sparse_from_dense(JLSparseMatrixCSC{T,Int}, xs)
 Base.length(x::JLSparseMatrixCSC) = prod(x.dims)
 Base.size(x::JLSparseMatrixCSC) = x.dims
 
@@ -463,29 +468,56 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, (xs.m, xs.n))
 end
 JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs))
-JLSparseMatrixCSR(xs::JLArray{<:Any,2}) = JLSparseMatrixCSR(SparseMatrixCSC(xs))
+JLSparseMatrixCSR(xs::JLArray{T,2}) where {T} =
+    GPUArrays.sparse_from_dense(JLSparseMatrixCSR{T,Int}, xs)
 function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     rowInd, colInd, nzVal = findnz(xs)
     return JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(rowInd), JLVector{Ti}(colInd),
                                      JLVector{Tv}(nzVal), (xs.m, xs.n))
 end
 JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
-JLSparseMatrixCOO(xs::JLArray{<:Any,2}) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
+JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} =
+    GPUArrays.sparse_from_dense(JLSparseMatrixCOO{T,Int}, xs)
 function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR(SparseMatrixCSC(xs))
 end
 function JLSparseMatrixCOO(xs::Union{JLSparseMatrixCSC{Tv, Ti},JLSparseMatrixCSR{Tv, Ti}}) where {Tv, Ti}
     return JLSparseMatrixCOO(SparseMatrixCSC(xs))
 end
+function JLSparseMatrixCOO{Tv,Ti}(xs::Union{JLSparseMatrixCSC{Tv, Ti},JLSparseMatrixCSR{Tv, Ti}}) where {Tv, Ti}
+    return JLSparseMatrixCOO(SparseMatrixCSC(xs))
+end
 function JLSparseMatrixCSC(xs::JLSparseMatrixCSR{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSC(SparseMatrixCSC(xs))
 end
 function JLSparseMatrixCSC(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSC(SparseMatrixCSC(xs))
 end
+function JLSparseMatrixCSC{Tv,Ti}(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
+    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
+end
 function JLSparseMatrixCSR(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR(SparseMatrixCSC(xs))
 end
+function JLSparseMatrixCSR{Tv,Ti}(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
+    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
+end
+
+SparseArrays.sparse(xs::JLArray{T,1}) where {T} =
+    GPUArrays.sparse_from_dense(JLSparseVector{T,Int}, xs)
+
+function SparseArrays.sparse(xs::JLArray{T,2}; fmt=:csc) where {T}
+    if fmt == :csc
+        return GPUArrays.sparse_from_dense(JLSparseMatrixCSC{T,Int}, xs)
+    elseif fmt == :csr
+        return GPUArrays.sparse_from_dense(JLSparseMatrixCSR{T,Int}, xs)
+    elseif fmt == :coo
+        return GPUArrays.sparse_from_dense(JLSparseMatrixCOO{T,Int}, xs)
+    else
+        throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo"))
+    end
+end
+
 function Base.copyto!(dst::JLSparseMatrixCSR, src::JLSparseMatrixCSR)
     if size(dst) != size(src)
         throw(ArgumentError("Inconsistent Sparse Matrix size"))
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 266e659ce..6d9c7b0a3 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -44,55 +44,67 @@ Base.iszero(A::AbstractGPUSparseArray) = SparseArrays.nnz(A) == 0 || all(iszero,
 SparseArrays.SparseVector(x::AbstractGPUSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x)))
 SparseArrays.SparseMatrixCSC(x::AbstractGPUSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(SparseArrays.getcolptr(x)), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x)))
 
-@kernel function dense_sparse_values_kernel!(values, A, indices)
+function check_sparse_target(::Type{ST}, ::Type{Tv}, ::Type{Ti}) where {ST,Tv,Ti}
+    Tv isa Type && Ti isa Type ||
+        throw(ArgumentError("sparse target type $ST must specify value and index eltypes"))
+    return
+end
+
+@kernel function dense_sparse_vector_kernel!(iPtr, nzVal, A, indices)
     i = @index(Global, Linear)
     if i <= length(indices)
-        @inbounds values[i] = A[indices[i]]
+        @inbounds begin
+            idx = indices[i]
+            iPtr[i] = idx
+            nzVal[i] = A[idx]
+        end
     end
 end
 
-function dense_sparse_values(A::AnyGPUArray{Tv}, indices) where {Tv}
-    values = similar(A, Tv, length(indices))
-    if !isempty(values)
-        dense_sparse_values_kernel!(get_backend(A))(values, A, indices; ndrange=length(indices))
+@kernel function dense_sparse_matrix_kernel!(rowInd, colInd, nzVal, A, indices)
+    i = @index(Global, Linear)
+    if i <= length(indices)
+        @inbounds begin
+            idx = indices[i]
+            rowInd[i] = idx[1]
+            colInd[i] = idx[2]
+            nzVal[i] = A[idx]
+        end
     end
-    return values
-end
-
-function SparseArrays.SparseVector(x::AnyGPUVector)
-    inds = findall(!iszero, x)
-    vals = dense_sparse_values(x, inds)
-    host_inds = Array(inds)
-    host_vals = Array(vals)
-    unsafe_free!(inds)
-    unsafe_free!(vals)
-    return SparseVector(length(x), host_inds, host_vals)
 end
 
-function SparseArrays.SparseMatrixCSC(x::AnyGPUMatrix)
-    inds = findall(!iszero, x)
-    vals = dense_sparse_values(x, inds)
-    host_inds = Array(inds)
-    host_vals = Array(vals)
-    unsafe_free!(inds)
-    unsafe_free!(vals)
-
-    rows = getindex.(host_inds, 1)
-    cols = getindex.(host_inds, 2)
-    return sparse(rows, cols, host_vals, size(x)...)
+function sparse_from_dense(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
+    check_sparse_target(ST, Tv, Ti)
+    indices = findall(!iszero, A)
+    nstored = length(indices)
+    iPtr = similar(A, Ti, nstored)
+    nzVal = similar(A, Tv, nstored)
+    if nstored > 0
+        dense_sparse_vector_kernel!(get_backend(A))(iPtr, nzVal, A, indices;
+                                                    ndrange=nstored)
+    end
+    unsafe_free!(indices)
+    return ST(iPtr, nzVal, length(A))
 end
 
-function SparseArrays.sparse(A::AnyGPUVector)
-    sparse_array_type(A)(SparseVector(A))
-end
+function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrix{Tv,Ti}}
+    check_sparse_target(ST, Tv, Ti)
+    indices = findall(!iszero, A)
+    nstored = length(indices)
+    rowInd = similar(A, Ti, nstored)
+    colInd = similar(A, Ti, nstored)
+    nzVal = similar(A, Tv, nstored)
+    if nstored > 0
+        dense_sparse_matrix_kernel!(get_backend(A))(rowInd, colInd, nzVal, A, indices;
+                                                    ndrange=nstored)
+    end
+    unsafe_free!(indices)
 
-function SparseArrays.sparse(A::AnyGPUMatrix; fmt=:csc)
-    if fmt == :csc
-        return sparse_array_type(A)(SparseMatrixCSC(A))
-    elseif fmt == :csr
-        return csr_type(sparse_array_type(A))(SparseMatrixCSC(A))
+    if ST <: AbstractGPUSparseMatrixCOO
+        return ST(rowInd, colInd, nzVal, size(A), nstored)
     else
-        throw(ArgumentError("format :$fmt not available, use :csc or :csr"))
+        COO = coo_type(ST)
+        return ST(COO{Tv,Ti}(rowInd, colInd, nzVal, size(A), nstored))
     end
 end
 
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index e120e450c..431a5cd3c 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -157,6 +157,11 @@ function vector_construction(AT, eltypes)
 
         dense_x = dense_AT(collect(x))
         @test SparseVector(dense_x) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            typed_d_x = GPUArrays.sparse_from_dense(typeof(d_x), dense_x)
+            @test typed_d_x isa typeof(d_x)
+            @test SparseVector(typed_d_x) == x
+        end
         @test sparse(dense_x) isa AT{ET}
         @test SparseVector(sparse(dense_x)) == x
     end
@@ -239,6 +244,11 @@ function matrix_construction(AT, eltypes)
 
         dense_x = dense_AT(collect(x))
         @test SparseMatrixCSC(dense_x) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            typed_d_x = GPUArrays.sparse_from_dense(typeof(d_x), dense_x)
+            @test typed_d_x isa typeof(d_x)
+            @test SparseMatrixCSC(typed_d_x) == x
+        end
         @test sparse(dense_x) isa GPUArrays.sparse_array_type(dense_x){ET}
         @test SparseMatrixCSC(sparse(dense_x)) == x
         if dense_x isa GPUArrays.AnyGPUArray

From 5224d4286082d42c5f2edc939f63732c968edd77 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 23:12:20 +0200
Subject: [PATCH 04/15] Rework sparse back-end interface around value verbs and
 `similar`

The sparse support required back-ends to implement a cluster of
type-returning functions -- `dense_array_type`, `dense_vector_type`,
`sparse_array_type`, `csc_type`, `csr_type`, `coo_type` -- that generic
code called to name a related type (the dense counterpart, the
unparameterized sparse wrapper, a sibling format). That is unidiomatic:
Julia expresses these with value-level verbs (`float`, `complex`,
`sparse`) and `similar`, not type-level mapping functions.

Replace that machinery:

  * Format conversion is now the value verbs `sparse_csc`/`sparse_csr`/
    `sparse_coo`. Converting to the format already held is the generic
    identity; back-ends implement only the cross-format cases (which they
    already had as constructors). `csc_type`/`csr_type`/`coo_type` are gone.

  * Broadcast and `similar` outputs are built with `similar` dispatched on a
    representative sparse value (plus an internal `_stored_inds` field
    accessor for filling), mirroring `SparseArrays`' `_allocres`. No more
    reconstructing a sparse type from a type, so `sparse_array_type` is gone.

  * `dense_array_type`/`dense_vector_type` had no consumer in the generic
    algorithms (a dense result comes from `similar(nonzeros(A), ...)`); they
    are removed, and the test suite threads the dense array type explicitly.

  * `sparse_from_dense` now yields COO directly (the natural format when
    scanning a dense array); CSR/CSC are derived with the conversion verbs,
    removing the last type-level `coo_type(ST)` use.

A back-end's obligation shrinks to: the storage structs, their
constructors, `similar`, and the three conversion verbs. JLArrays is
updated as the reference; the GPUArrays test suite passes for both JLArray
and Array.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/JLArrays/src/JLArrays.jl |  73 +++++++------------
 src/host/sparse.jl           | 137 ++++++++++++++++-------------------
 test/testsuite/sparse.jl     |  52 +++++++------
 3 files changed, 114 insertions(+), 148 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 9f33a29bc..bb1011a0d 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -14,8 +14,6 @@ using GPUArrays
 using Adapt
 using SparseArrays, LinearAlgebra
 
-import GPUArrays: dense_array_type
-
 import KernelAbstractions
 import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
 
@@ -264,51 +262,32 @@ function Base.getindex(A::JLSparseMatrixCSR{Tv, Ti}, i0::Integer, i1::Integer) w
 end
 
 GPUArrays.storage(a::JLArray) = a.data
-GPUArrays.dense_array_type(a::JLArray{T, N}) where {T, N} = JLArray{T, N}
-GPUArrays.dense_array_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, N}
-GPUArrays.dense_vector_type(a::JLArray{T, N}) where {T, N} = JLArray{T, 1}
-GPUArrays.dense_vector_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, 1}
-
-GPUArrays.sparse_array_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(sa::JLArray{<:Any,2}) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,2}}) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSR
-GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSR
-GPUArrays.sparse_array_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCOO
-GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCOO
-GPUArrays.sparse_array_type(sa::JLSparseVector) = JLSparseVector
-GPUArrays.sparse_array_type(sa::JLArray{<:Any,1}) = JLSparseVector
-GPUArrays.sparse_array_type(::Type{<:JLArray{<:Any,1}}) = JLSparseVector
-GPUArrays.sparse_array_type(::Type{<:JLSparseVector}) = JLSparseVector
-
-GPUArrays.dense_array_type(sa::JLSparseVector) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseVector}) = JLArray
-GPUArrays.dense_array_type(sa::JLSparseMatrixCSC) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSC}) = JLArray
-GPUArrays.dense_array_type(sa::JLSparseMatrixCSR) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSR}) = JLArray
-GPUArrays.dense_array_type(sa::JLSparseMatrixCOO) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCOO}) = JLArray
-
-GPUArrays.csc_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSC
-GPUArrays.csc_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCSC
-GPUArrays.csc_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSC
-GPUArrays.csc_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCSC
-GPUArrays.csr_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSR
-GPUArrays.csr_type(sa::JLSparseMatrixCOO) = JLSparseMatrixCSR
-GPUArrays.csr_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSR
-GPUArrays.csr_type(::Type{<:JLSparseMatrixCOO}) = JLSparseMatrixCSR
-GPUArrays.coo_type(sa::Union{JLSparseMatrixCSC,JLSparseMatrixCSR,JLSparseMatrixCOO}) = JLSparseMatrixCOO
-GPUArrays.coo_type(::Type{<:Union{JLSparseMatrixCSC,JLSparseMatrixCSR,JLSparseMatrixCOO}}) = JLSparseMatrixCOO
+# format-conversion verbs (cross-format only; the identity cases are handled generically
+# by GPUArrays). these reuse the format constructors defined below.
+GPUArrays.sparse_csc(A::Union{JLSparseMatrixCSR,JLSparseMatrixCOO}) = JLSparseMatrixCSC(A)
+GPUArrays.sparse_csr(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) = JLSparseMatrixCSR(A)
+GPUArrays.sparse_coo(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) = JLSparseMatrixCOO(A)
 
 Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat))
 Base.similar(Mat::JLSparseMatrixCOO) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat)), size(Mat), nnz(Mat))
 Base.similar(Mat::JLSparseMatrixCOO, T::Type) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat), T), size(Mat), nnz(Mat))
 
-Base.similar(Mat::JLSparseMatrixCSC, T::Type, N::Int, M::Int) =  JLSparseMatrixCSC(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
-Base.similar(Mat::JLSparseMatrixCSR, T::Type, N::Int, M::Int) =  JLSparseMatrixCSR(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
+# CSC and vector structure-preserving `similar` (mirror the CSR/COO methods above; these
+# used to come for free from GPUArrays via a type-reconstruction helper).
+Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}) where {T, Ti} =
+    JLSparseMatrixCSC{T, Ti}(copy(Mat.colPtr), copy(Mat.rowVal), similar(nonzeros(Mat), T), size(Mat))
+Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}) where {T, Ti} =
+    JLSparseVector{T, Ti}(copy(Vec.iPtr), similar(nonzeros(Vec), T), length(Vec))
+
+# empty sparse arrays of a given shape, used by the generic broadcast machinery as the
+# output container before its storage is sized and filled. the index type is preserved.
+Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
+    JLSparseMatrixCSC{T, Ti}(JLVector(ones(Ti, M + 1)), JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), (N, M))
+Base.similar(Mat::JLSparseMatrixCSR{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
+    JLSparseMatrixCSR{T, Ti}(JLVector(ones(Ti, N + 1)), JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), (N, M))
+Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}, dims::Dims{1}) where {T, Ti} =
+    JLSparseVector{T, Ti}(JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), dims[1])
 
 Base.similar(Mat::JLSparseMatrixCSC{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
 Base.similar(Mat::JLSparseMatrixCSR{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
@@ -452,8 +431,7 @@ function JLSparseMatrixCSC(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
 end
 SparseArrays.SparseMatrixCSC(xs::JLArray{<:Any,2}) = SparseMatrixCSC(JLSparseMatrixCSC(xs))
 JLSparseMatrixCSC(xs::SparseVector) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
-JLSparseMatrixCSC(xs::JLArray{T,2}) where {T} =
-    GPUArrays.sparse_from_dense(JLSparseMatrixCSC{T,Int}, xs)
+JLSparseMatrixCSC(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSC(JLSparseMatrixCOO(xs))
 Base.length(x::JLSparseMatrixCSC) = prod(x.dims)
 Base.size(x::JLSparseMatrixCSC) = x.dims
 
@@ -468,8 +446,7 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, (xs.m, xs.n))
 end
 JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs))
-JLSparseMatrixCSR(xs::JLArray{T,2}) where {T} =
-    GPUArrays.sparse_from_dense(JLSparseMatrixCSR{T,Int}, xs)
+JLSparseMatrixCSR(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSR(JLSparseMatrixCOO(xs))
 function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     rowInd, colInd, nzVal = findnz(xs)
     return JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(rowInd), JLVector{Ti}(colInd),
@@ -508,11 +485,11 @@ SparseArrays.sparse(xs::JLArray{T,1}) where {T} =
 
 function SparseArrays.sparse(xs::JLArray{T,2}; fmt=:csc) where {T}
     if fmt == :csc
-        return GPUArrays.sparse_from_dense(JLSparseMatrixCSC{T,Int}, xs)
+        return JLSparseMatrixCSC(xs)
     elseif fmt == :csr
-        return GPUArrays.sparse_from_dense(JLSparseMatrixCSR{T,Int}, xs)
+        return JLSparseMatrixCSR(xs)
     elseif fmt == :coo
-        return GPUArrays.sparse_from_dense(JLSparseMatrixCOO{T,Int}, xs)
+        return JLSparseMatrixCOO(xs)
     else
         throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo"))
     end
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 6d9c7b0a3..77c1544bb 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -87,7 +87,11 @@ function sparse_from_dense(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:Abstrac
     return ST(iPtr, nzVal, length(A))
 end
 
-function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrix{Tv,Ti}}
+# Densifying a dense array naturally produces coordinate (COO) format, so this builds a
+# COO directly. Other formats are obtained by converting the COO with `sparse_csr`/
+# `sparse_csc` (which back-ends already provide as constructors), avoiding any need to map
+# a target type to its COO sibling.
+function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}}
     check_sparse_target(ST, Tv, Ti)
     indices = findall(!iszero, A)
     nstored = length(indices)
@@ -99,34 +103,23 @@ function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:Abstrac
                                                     ndrange=nstored)
     end
     unsafe_free!(indices)
-
-    if ST <: AbstractGPUSparseMatrixCOO
-        return ST(rowInd, colInd, nzVal, size(A), nstored)
-    else
-        COO = coo_type(ST)
-        return ST(COO{Tv,Ti}(rowInd, colInd, nzVal, size(A), nstored))
-    end
+    return ST(rowInd, colInd, nzVal, size(A), nstored)
 end
 
 # similar
 Base.similar(Vec::V) where {V<:AbstractGPUSparseVector} = V(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec)), length(Vec))
 Base.similar(Mat::M) where {M<:AbstractGPUSparseMatrixCSC} = M(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat)), size(Mat))
 
-Base.similar(Vec::V, T::Type) where {Tv, Ti, V<:AbstractGPUSparseVector{Tv, Ti}} = sparse_array_type(V){T, Ti}(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec), T), length(Vec))
-Base.similar(Mat::M, T::Type) where {M<:AbstractGPUSparseMatrixCSC} = sparse_array_type(M)(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat), T), size(Mat))
-
-dense_array_type(a::AbstractArray)     = dense_array_type(typeof(a))
-sparse_array_type(a::AbstractArray)    = sparse_array_type(typeof(a))
-dense_vector_type(a::AbstractArray)    = dense_vector_type(typeof(a))
+# `similar(A, ::Type)` (structure-preserving, new element type) is a back-end method, just
+# like the dense `similar`. The generic algorithms below always build their outputs with
+# `similar` (dispatched on a representative value), so there is no need for backends to
+# provide `*_array_type` mapping functions to reconstruct a sparse type from a type.
 
-dense_array_type(::Type{<:SparseVector}) = SparseVector
-sparse_array_type(::Type{<:SparseVector}) = SparseVector
-dense_vector_type(::Type{<:AbstractSparseArray}) = Vector
-dense_vector_type(::Type{<:AbstractArray})       = Vector
-dense_array_type(::Type{<:SparseMatrixCSC}) = SparseMatrixCSC
-sparse_array_type(::Type{<:SparseMatrixCSC})    = SparseMatrixCSC
-
-coo_type(sa) = coo_type(typeof(sa))
+# Stored (minor-axis) index buffer, accessed generically through the conventional field;
+# the analogue of `SparseArrays.storedinds`. Used by the broadcast/construction code.
+_stored_inds(A::AbstractGPUSparseMatrixCSR) = A.colVal
+_stored_inds(A::AbstractGPUSparseMatrixCSC) = A.rowVal
+_stored_inds(A::AbstractGPUSparseVector)    = A.iPtr
 
 function _spadjoint end
 function _sptranspose end
@@ -152,8 +145,11 @@ end
 
 sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A)
 
+# Format-conversion verbs: converting to the format already held is the identity;
+# back-ends implement the cross-format conversions (as constructors).
+sparse_csc(A::AbstractGPUSparseMatrixCSC) = A
+sparse_csr(A::AbstractGPUSparseMatrixCSR) = A
 sparse_coo(A::AbstractGPUSparseMatrixCOO) = A
-sparse_coo(A::AbstractGPUSparseMatrix) = coo_type(A)(A)
 
 @inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x)
     @inbounds Atomix.@atomic A[Int(I)] += x
@@ -447,7 +443,7 @@ function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2)
     end
 end
 
-LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSC, p::Real=2) = opnorm(csr_type(A)(A), p)
+LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSC, p::Real=2) = opnorm(sparse_csr(A), p)
 
 function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T
     if p == Inf
@@ -462,7 +458,7 @@ function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T
 end
 
 function SparseArrays.findnz(S::MT) where {MT <: AbstractGPUSparseMatrix}
-    S2 = coo_type(MT)(S)
+    S2 = sparse_coo(S)
     I = S2.rowInd
     J = S2.colInd
     V = S2.nzVal
@@ -502,61 +498,61 @@ end
 for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCSR]
     @eval begin
         LinearAlgebra.triu(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(A), k) )
+            ST( triu(sparse_coo(A), k) )
         LinearAlgebra.triu(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(_sptranspose(parent(A))), k) )
+            ST( triu(sparse_coo(_sptranspose(parent(A))), k) )
         LinearAlgebra.triu(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(_spadjoint(parent(A))), k) )
+            ST( triu(sparse_coo(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.tril(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(A), k) )
+            ST( tril(sparse_coo(A), k) )
         LinearAlgebra.tril(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(_sptranspose(parent(A))), k) )
+            ST( tril(sparse_coo(_sptranspose(parent(A))), k) )
         LinearAlgebra.tril(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(_spadjoint(parent(A))), k) )
+            ST( tril(sparse_coo(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.triu(A::Union{ST, Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(A), 0) )
+            ST( triu(sparse_coo(A), 0) )
         LinearAlgebra.tril(A::Union{ST,Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(A), 0) )
+            ST( tril(sparse_coo(A), 0) )
 
         LinearAlgebra.kron(A::ST, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(A), coo_type(B)(B)) )
+            ST( kron(sparse_coo(A), sparse_coo(B)) )
         LinearAlgebra.kron(A::ST, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(A), B) )
+            ST( kron(sparse_coo(A), B) )
         LinearAlgebra.kron(A::Diagonal, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, coo_type(B)(B)) )
+            ST( kron(A, sparse_coo(B)) )
 
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(B)(B)) )
+            ST( kron(sparse_coo(_sptranspose(parent(A))), sparse_coo(B)) )
         LinearAlgebra.kron(A::ST, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(A), coo_type(parent(B))(_sptranspose(parent(B)))) )
+            ST( kron(sparse_coo(A), sparse_coo(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) )
+            ST( kron(sparse_coo(_sptranspose(parent(A))), sparse_coo(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), B) )
+            ST( kron(sparse_coo(_sptranspose(parent(A))), B) )
         LinearAlgebra.kron(A::Diagonal, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, coo_type(B)(_sptranspose(parent(B)))) )
+            ST( kron(A, sparse_coo(_sptranspose(parent(B)))) )
 
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_spadjoint(parent(A))), coo_type(B)(B)) )
+            ST( kron(sparse_coo(_spadjoint(parent(A))), sparse_coo(B)) )
         LinearAlgebra.kron(A::ST, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(A), coo_type(parent(B))(_spadjoint(parent(B)))) )
+            ST( kron(sparse_coo(A), sparse_coo(_spadjoint(parent(B)))) )
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), coo_type(parent(B))(_spadjoint(parent(B)))) )
+            ST( kron(sparse_coo(_spadjoint(parent(A))), sparse_coo(_spadjoint(parent(B)))) )
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), B) )
+            ST( kron(sparse_coo(_spadjoint(parent(A))), B) )
         LinearAlgebra.kron(A::Diagonal, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, coo_type(parent(B))(_spadjoint(parent(B)))) )
+            ST( kron(A, sparse_coo(_spadjoint(parent(B)))) )
 
 
         function Base.reshape(A::ST, dims::Dims) where {ST<:$SparseMatrixType}
-            B = coo_type(A)(A)
+            B = sparse_coo(A)
             ST(reshape(B, dims))
         end
 
         function SparseArrays.droptol!(A::ST, tol::Real) where {ST<:$SparseMatrixType}
-            B = coo_type(A)(A)
+            B = sparse_coo(A)
             droptol!(B, tol)
             copyto!(A, ST(B))
         end
@@ -1184,17 +1180,10 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt
     elseif length(sparse_args) == 1 && sparse_typ <: Union{AbstractGPUSparseMatrixCSR, AbstractGPUSparseMatrixCSC}
         # we only have a single sparse input, so we can reuse its structure for the output.
         # this avoids a kernel launch and costly synchronization.
-        if sparse_typ <: AbstractGPUSparseMatrixCSR
-            offsets = rowPtr = sparse_arg.rowPtr
-            colVal  = similar(sparse_arg.colVal)
-            nzVal   = similar(sparse_arg.nzVal, Tv)
-            output  = sparse_array_type(sparse_typ)(rowPtr, colVal, nzVal, size(bc))
-        elseif sparse_typ <: AbstractGPUSparseMatrixCSC
-            offsets = colPtr = sparse_arg.colPtr
-            rowVal  = similar(sparse_arg.rowVal)
-            nzVal   = similar(sparse_arg.nzVal, Tv)
-            output  = sparse_array_type(sparse_typ)(colPtr, rowVal, nzVal, size(bc))
-        end
+        # structure is preserved, so a structure-preserving `similar` gives the output
+        # container directly; the kernel rewrites the (identical) pointer array from `offsets`.
+        output  = similar(sparse_arg, Tv)
+        offsets = sparse_typ <: AbstractGPUSparseMatrixCSR ? sparse_arg.rowPtr : sparse_arg.colPtr
     else
         # determine the number of non-zero elements per row so that we can create an
         # appropriately-structured output container
@@ -1240,17 +1229,22 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt
             total_nnz = mapreduce(x->first(x) != typemax(first(x)), +, offsets)
         end
         output = if sparse_typ <: Union{AbstractGPUSparseMatrixCSR,AbstractGPUSparseMatrixCSC}
-            ixVal = similar(offsets, Ti, total_nnz)
-            nzVal = similar(offsets, Tv, total_nnz)
-            output_sparse_typ = sparse_array_type(sparse_typ) 
-            output_sparse_typ(offsets, ixVal, nzVal, size(bc))
+            # build an empty sparse output of the same kind, then size its index/value
+            # storage to the computed nnz. The kernel fills the pointer array (from
+            # `offsets`), the stored indices, and the values.
+            out = similar(sparse_arg, Tv, size(bc))
+            resize!(_stored_inds(out), total_nnz)
+            resize!(nonzeros(out), total_nnz)
+            out.nnz = total_nnz
+            out
         elseif sparse_typ <: AbstractGPUSparseVector && !fpreszeros
-            val_array = bc.args[first(sparse_args)].nzVal
-            similar(val_array, Tv, size(bc))
+            similar(nonzeros(sparse_arg), Tv, size(bc))
         elseif sparse_typ <: AbstractGPUSparseVector && fpreszeros
-            iPtr   = similar(offsets, Ti, total_nnz)
-            nzVal  = similar(offsets, Tv, total_nnz)
-            sparse_array_type(sparse_arg){Tv, Ti}(iPtr, nzVal, rows)
+            out = similar(sparse_arg, Tv, (rows,))
+            resize!(_stored_inds(out), total_nnz)
+            resize!(nonzeros(out), total_nnz)
+            out.nnz = total_nnz
+            out
         end
         if sparse_typ <: AbstractGPUSparseVector && !fpreszeros
             nonsparse_args = map(bc.args) do arg
@@ -1353,9 +1347,6 @@ end
 end
 ## COV_EXCL_STOP
 
-function csc_type end
-function csr_type end
-
 # TODO: implement mapreducedim!
 function Base.mapreduce(f, op, A::AbstractGPUSparseMatrix; dims=:, init=nothing)
     # figure out the destination container type by looking at the initializer element,
@@ -1377,9 +1368,9 @@ function Base.mapreduce(f, op, A::AbstractGPUSparseMatrix; dims=:, init=nothing)
     in(dims, [Colon(), 1, 2]) || error("only dims=:, dims=1 or dims=2 is supported")
 
     if A isa AbstractGPUSparseMatrixCSR && dims == 1
-        A = csc_type(A)(A)
+        A = sparse_csc(A)
     elseif A isa AbstractGPUSparseMatrixCSC && dims == 2
-        A = csr_type(A)(A)
+        A = sparse_csr(A)
     end
     m, n      = size(A)
     val_array = nonzeros(A)
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 431a5cd3c..f9c91d0f1 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -2,14 +2,14 @@
     sparse_ATs = sparse_types(AT)
     for sparse_AT in sparse_ATs
         if sparse_AT <: AbstractSparseVector
-            vector(sparse_AT, eltypes)
-            vector_construction(sparse_AT, eltypes)
-            broadcasting_vector(sparse_AT, eltypes)
+            vector(sparse_AT, AT, eltypes)
+            vector_construction(sparse_AT, AT, eltypes)
+            broadcasting_vector(sparse_AT, AT, eltypes)
             iszero_vector(sparse_AT, eltypes)
         elseif sparse_AT <: AbstractSparseMatrix
             matrix(sparse_AT, eltypes)
-            matrix_construction(sparse_AT, eltypes)
-            broadcasting_matrix(sparse_AT, eltypes)
+            matrix_construction(sparse_AT, AT, eltypes)
+            broadcasting_matrix(sparse_AT, AT, eltypes)
             mapreduce_matrix(sparse_AT, eltypes)
             linalg(sparse_AT, eltypes)
             iszero_matrix(sparse_AT, eltypes)
@@ -68,6 +68,12 @@ function coo_lhs(::Type{T}) where {T<:Complex}
     ]
 end
 
+# A non-zero-preserving broadcast densifies its result: GPU back-ends return the dense
+# array type `dense_AT`, whereas CPU `SparseArrays` returns its sparse type with every
+# entry explicitly stored. Accept either so the suite is meaningful for both.
+is_densified(x, dense_AT, ::Type{ET}) where {ET} =
+    dense_AT <: GPUArrays.AnyGPUArray ? x isa dense_AT{ET} : (issparse(x) && eltype(x) === ET)
+
 function coo_matmul(AT, dense_AT, eltypes)
     for ET in (Float16, Float32, ComplexF16, ComplexF32)
         ET in eltypes || continue
@@ -103,8 +109,7 @@ function coo_matmul(AT, dense_AT, eltypes)
     end
 end
 
-function vector(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "Sparse vector properties($ET)" begin
             m = 25
@@ -141,8 +146,7 @@ function vector(AT, eltypes)
     end
 end
 
-function vector_construction(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function vector_construction(AT, dense_AT, eltypes)
     for ET in eltypes
         m = 25
         n = 35
@@ -158,7 +162,7 @@ function vector_construction(AT, eltypes)
         dense_x = dense_AT(collect(x))
         @test SparseVector(dense_x) == x
         if dense_x isa GPUArrays.AnyGPUArray
-            typed_d_x = GPUArrays.sparse_from_dense(typeof(d_x), dense_x)
+            typed_d_x = AT(dense_x)
             @test typed_d_x isa typeof(d_x)
             @test SparseVector(typed_d_x) == x
         end
@@ -168,7 +172,6 @@ function vector_construction(AT, eltypes)
 end
 
 function matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         @testset "Sparse matrix properties($ET)" begin
             m = 25
@@ -224,8 +227,7 @@ function matrix(AT, eltypes)
     end
 end
 
-function matrix_construction(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function matrix_construction(AT, dense_AT, eltypes)
     for ET in eltypes
         m = 25
         n = 35
@@ -245,11 +247,11 @@ function matrix_construction(AT, eltypes)
         dense_x = dense_AT(collect(x))
         @test SparseMatrixCSC(dense_x) == x
         if dense_x isa GPUArrays.AnyGPUArray
-            typed_d_x = GPUArrays.sparse_from_dense(typeof(d_x), dense_x)
+            typed_d_x = AT(dense_x)
             @test typed_d_x isa typeof(d_x)
             @test SparseMatrixCSC(typed_d_x) == x
         end
-        @test sparse(dense_x) isa GPUArrays.sparse_array_type(dense_x){ET}
+        @test issparse(sparse(dense_x))
         @test SparseMatrixCSC(sparse(dense_x)) == x
         if dense_x isa GPUArrays.AnyGPUArray
             @test SparseMatrixCSC(sparse(dense_x; fmt=:csr)) == x
@@ -257,8 +259,7 @@ function matrix_construction(AT, eltypes)
     end
 end
 
-function broadcasting_vector(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function broadcasting_vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "SparseVector($ET)" begin
             m  = 64
@@ -278,14 +279,14 @@ function broadcasting_vector(AT, eltypes)
             # not zero-preserving
             y  = x  .+ ET(1)
             dy = dx .+ ET(1)
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             hy = Array(dy)
             @test Array(y) == hy
 
             # involving something dense
             y  = x  .+ ones(ET, m)
             dy = dx .+ dense_AT(ones(ET, m))
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             @test Array(y) == Array(dy)
 
             # sparse to sparse
@@ -316,7 +317,7 @@ function broadcasting_vector(AT, eltypes)
             dw = AT(w)
             z  = @. x  * y  * w  * dense_arr
             dz = @. dx * dy * dw * d_dense_arr
-            @test dz isa dense_AT{ET}
+            @test is_densified(dz, dense_AT, ET)
             @test Array(z) == Array(dz)
             
             y  = sprand(ET, m, p)
@@ -332,7 +333,7 @@ function broadcasting_vector(AT, eltypes)
             dx = AT(x)
             dy = dx .+ 1
             y  = x .+ 1
-            @test dy isa dense_AT{promote_type(ET, Int)}
+            @test is_densified(dy, dense_AT, promote_type(ET, Int))
             @test Array(y) == Array(dy)
             ## zero-preserving
             dy = dx .* 1
@@ -346,8 +347,7 @@ function broadcasting_vector(AT, eltypes)
     end
 end
 
-function broadcasting_matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function broadcasting_matrix(AT, dense_AT, eltypes)
     for ET in eltypes
        @testset "SparseMatrix($ET)" begin
             m, n = 5, 6
@@ -363,7 +363,7 @@ function broadcasting_matrix(AT, eltypes)
             # not zero-preserving
             y  = x  .+ ET(1)
             dy = dx .+ ET(1)
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             hy = Array(dy)
             dense_y = Array(y)
             @test Array(y) == Array(dy)
@@ -371,7 +371,7 @@ function broadcasting_matrix(AT, eltypes)
             # involving something dense
             y  = x  .* ones(ET, m, n)
             dy = dx .* dense_AT(ones(ET, m, n))
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             @test Array(y) == Array(dy)
             
             # multiple inputs
@@ -407,7 +407,6 @@ function broadcasting_matrix(AT, eltypes)
 end
 
 function mapreduce_matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         @testset "SparseMatrix($ET)" begin
             m,n = 5,6
@@ -460,7 +459,6 @@ function mapreduce_matrix(AT, eltypes)
 end
 
 function linalg(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         # sprandn doesn't work nicely with these...
         if !(ET <: Union{Int16, Int32, Int64, Complex{Int16}, Complex{Int32}, Complex{Int64}})

From 7e149d08534fdafee7feada6e24b3e491bfcdfd9 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 23:13:23 +0200
Subject: [PATCH 05/15] testsuite: only test sparse `abs`-reductions when the
 result type is supported
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`mapreduce(abs, …)` over a `Complex{<:Integer}` sparse matrix has result
element type `Float64` (since `abs(::Complex{<:Integer})::Float64`). Back-ends
that can't allocate that type (e.g. Metal, which has no `Float64`) errored on
these cases. Gate the `abs`-reductions on the result type being among the
back-end's supported eltypes; `sum`, which preserves the input eltype, stays
unconditional.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/testsuite/sparse.jl | 47 ++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index f9c91d0f1..7a15dd09e 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -414,32 +414,45 @@ function mapreduce_matrix(AT, eltypes)
             x = sprand(ET, m, n, p)
             dx = AT(x)
 
+            # The `abs`-reductions below produce a result of element type `RT`; for some
+            # input eltypes that is a type the back-end can't allocate (e.g. `abs` of a
+            # `Complex{<:Integer}` is `Float64`, unsupported on Metal). Only exercise them
+            # when the back-end supports the result type; `sum` keeps the input eltype and
+            # is always tested.
+            RT = typeof(abs(zero(ET)))
+
             # dim=:
             y  = sum(x)
             dy = sum(dx)
             @test y ≈ dy
 
-            y  = mapreduce(abs, +, x)
-            dy = mapreduce(abs, +, dx)
-            @test y ≈ dy
+            if RT in eltypes
+                y  = mapreduce(abs, +, x)
+                dy = mapreduce(abs, +, dx)
+                @test y ≈ dy
+            end
 
             # dim=1
             y  = sum(x, dims=1)
             dy = sum(dx, dims=1)
             @test y ≈ Array(dy)
 
-            y  = mapreduce(abs, +, x, dims=1)
-            dy = mapreduce(abs, +, dx, dims=1)
-            @test y ≈ Array(dy)
+            if RT in eltypes
+                y  = mapreduce(abs, +, x, dims=1)
+                dy = mapreduce(abs, +, dx, dims=1)
+                @test y ≈ Array(dy)
+            end
 
             # dim=2
             y = sum(x, dims=2)
             dy = sum(dx, dims=2)
             @test y ≈ Array(dy)
 
-            y  = mapreduce(abs, +, x, dims=2)
-            dy = mapreduce(abs, +, dx, dims=2)
-            @test y ≈ Array(dy)
+            if RT in eltypes
+                y  = mapreduce(abs, +, x, dims=2)
+                dy = mapreduce(abs, +, dx, dims=2)
+                @test y ≈ Array(dy)
+            end
             if ET in (Float32, Float64)
                 dy = mapreduce(abs, +, dx; init=zero(ET))
                 y  = mapreduce(abs, +, x; init=zero(ET))
@@ -447,13 +460,15 @@ function mapreduce_matrix(AT, eltypes)
             end
 
             # test with a matrix with fully empty rows
-            x = zeros(ET, m, n)
-            x[2, :] .= -one(ET)
-            x[2, end] = -ET(16)
-            dx = AT(sparse(x))
-            y  = mapreduce(abs, max, x)
-            dy = mapreduce(abs, max, dx)
-            @test y ≈ dy
+            if RT in eltypes
+                x = zeros(ET, m, n)
+                x[2, :] .= -one(ET)
+                x[2, end] = -ET(16)
+                dx = AT(sparse(x))
+                y  = mapreduce(abs, max, x)
+                dy = mapreduce(abs, max, dx)
+                @test y ≈ dy
+            end
         end
     end
 end

From 92bde191ad055ed95ec8bbe1e4f811512af15e7b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 22 Jun 2026 23:14:11 +0200
Subject: [PATCH 06/15] docs: document the sparse array back-end interface

Spell out the parallel sparse hierarchy, the storage structs and their
conventional field names, and the methods a back-end implements (constructors,
`similar`, and the `sparse_csc`/`sparse_csr`/`sparse_coo` conversion verbs),
plus the generic functionality that comes for free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/src/interface.md | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index a9c57c539..1e86b8245 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -31,6 +31,52 @@ KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
 
 There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/main/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/main/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/main/src/gpuarrays.jl).
 
+## Sparse arrays
+
+Sparse arrays cannot share the `AbstractGPUArray` supertype: that is a
+`DenseArray`, whereas a sparse array must be an `AbstractSparseArray`. GPUArrays
+therefore provides a parallel hierarchy, and generic sparse functionality
+(broadcast, `mapreduce`, sparse/dense matrix multiplication, `norm`/`opnorm`,
+`findnz`, `triu`/`tril`/`kron`, format conversion, indexing) is defined on it.
+
+A back-end provides one mutable struct per supported format, subtyping the
+matching abstract type and using the conventional field names (the generic code
+reads them directly):
+
+| supertype | fields |
+|:--|:--|
+| `AbstractGPUSparseVector{Tv,Ti}`    | `iPtr`, `nzVal`, `len`, `nnz` |
+| `AbstractGPUSparseMatrixCSC{Tv,Ti}` | `colPtr`, `rowVal`, `nzVal`, `dims`, `nnz` |
+| `AbstractGPUSparseMatrixCSR{Tv,Ti}` | `rowPtr`, `colVal`, `nzVal`, `dims`, `nnz` |
+| `AbstractGPUSparseMatrixCOO{Tv,Ti}` | `rowInd`, `colInd`, `nzVal`, `dims`, `nnz` |
+
+where the index/pointer/value arrays are the back-end's own dense vector type
+(so `nonzeros(A)`, `SparseArrays.nonzeroinds`, `rowvals`, `getcolptr` work, and
+`get_backend(nonzeros(A))` identifies the compute backend).
+
+On top of the structs, a back-end implements:
+
+  * constructors from the component arrays (e.g.
+    `MyCSR(rowPtr, colVal, nzVal, dims)`) and the inter-format and
+    dense↔sparse conversions (`MyCSR(::MyCOO)`, `MyCSC(::SparseMatrixCSC)`,
+    `SparseMatrixCSC(::MyCSC)`, …);
+
+  * `Base.similar` — structure-preserving (`similar(A)`, `similar(A, ::Type)`)
+    and empty-of-a-shape (`similar(A, ::Type, dims)`), exactly as for dense
+    arrays. The generic algorithms allocate their outputs through `similar`
+    rather than by naming a type;
+
+  * the format-conversion verbs `GPUArrays.sparse_csc`, `GPUArrays.sparse_csr`
+    and `GPUArrays.sparse_coo`, for the formats other than the one each
+    produces (converting to the format already held is the generic identity).
+    These are the value-level equivalent of `SparseArrays.sparse` and back the
+    generic code that needs a particular layout (e.g. matrix multiplication
+    funnels through COO).
+
+`sparse_from_dense(::Type{<:AbstractGPUSparseMatrixCOO}, A)` is provided
+generically to densify-scan into COO; a back-end's CSR/CSC-from-dense
+constructors can be defined as `sparse_csr`/`sparse_csc` of that COO.
+
 ## Caching Allocator
 
 ```@docs

From 38b5dc52ef1f01b14e08ecb98528fd0dec363281 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 06:58:07 +0200
Subject: [PATCH 07/15] =?UTF-8?q?Add=20on-device=20`to=5Fdense`;=20rename?=
 =?UTF-8?q?=20`sparse=5Ffrom=5Fdense`=20=E2=86=92=20`to=5Fsparse`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Densifying a sparse array had no generic on-device path: back-ends did it with a
host round-trip (`Array(collect(SparseMatrixCSC(x)))`). Add `to_dense(A)`, which
scatters the stored entries into a dense array of the same back-end with a kernel
(allocated via `similar(nonzeros(A), …)`), no host transfer. Coordinates are taken
to be unique -- true for CSC/CSR and for COO produced by conversion.

Rename the existing `sparse_from_dense` to `to_sparse` so the two on-device
conversions form a symmetric, discoverable pair (`to_sparse` needs a target format;
`to_dense` does not). JLArrays uses `to_dense` for `JLArray(::JLSparse…)`, and the
interface docs are updated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/src/interface.md        |  9 ++++---
 lib/JLArrays/src/JLArrays.jl | 14 +++++------
 src/host/sparse.jl           | 49 ++++++++++++++++++++++++++++++++++--
 3 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index 1e86b8245..4bd21efa0 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -73,9 +73,12 @@ On top of the structs, a back-end implements:
     generic code that needs a particular layout (e.g. matrix multiplication
     funnels through COO).
 
-`sparse_from_dense(::Type{<:AbstractGPUSparseMatrixCOO}, A)` is provided
-generically to densify-scan into COO; a back-end's CSR/CSC-from-dense
-constructors can be defined as `sparse_csr`/`sparse_csc` of that COO.
+Dense↔sparse conversion is provided generically and on-device: `to_sparse(::Type{ST},
+A)` scans a dense array into a sparse one (`ST` is an `AbstractGPUSparseVector` or
+`AbstractGPUSparseMatrixCOO` -- CSR/CSC follow via the conversion verbs), and `to_dense(A)`
+is the inverse, scattering a sparse array into a dense one of the same back-end. A
+back-end's dense↔sparse constructors (`MyArray(::MySparse…)`, `MyCSR(::MyDense)`) can be
+defined in terms of these.
 
 ## Caching Allocator
 
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index bb1011a0d..aa935950c 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -298,10 +298,10 @@ Base.similar(Mat::JLSparseMatrixCSR, T::Type, dims::Tuple{Int, Int}) = similar(M
 Base.similar(Mat::JLSparseMatrixCSC, dims::Tuple{Int, Int}) = similar(Mat, dims...)
 Base.similar(Mat::JLSparseMatrixCSR, dims::Tuple{Int, Int}) = similar(Mat, dims...)
 
-JLArray(x::JLSparseVector)    = JLArray(collect(SparseVector(x)))
-JLArray(x::JLSparseMatrixCSC) = JLArray(collect(SparseMatrixCSC(x)))
-JLArray(x::JLSparseMatrixCSR) = JLArray(collect(SparseMatrixCSC(x)))
-JLArray(x::JLSparseMatrixCOO) = JLArray(collect(SparseMatrixCSC(x)))
+JLArray(x::JLSparseVector)    = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCSC) = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCSR) = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCOO) = GPUArrays.to_dense(x)
 
 # conversion of untyped data to a typed Array
 function typed_data(x::JLArray{T}) where {T}
@@ -416,7 +416,7 @@ function JLSparseVector(xs::SparseVector{Tv, Ti}) where {Ti, Tv}
 end
 SparseArrays.SparseVector(xs::JLArray{<:Any,1}) = SparseVector(JLSparseVector(xs))
 JLSparseVector(xs::JLArray{T,1}) where {T} =
-    GPUArrays.sparse_from_dense(JLSparseVector{T,Int}, xs)
+    GPUArrays.to_sparse(JLSparseVector{T,Int}, xs)
 Base.length(x::JLSparseVector) = x.len
 Base.size(x::JLSparseVector) = (x.len,)
 
@@ -454,7 +454,7 @@ function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
 end
 JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
 JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} =
-    GPUArrays.sparse_from_dense(JLSparseMatrixCOO{T,Int}, xs)
+    GPUArrays.to_sparse(JLSparseMatrixCOO{T,Int}, xs)
 function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR(SparseMatrixCSC(xs))
 end
@@ -481,7 +481,7 @@ function JLSparseMatrixCSR{Tv,Ti}(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
 end
 
 SparseArrays.sparse(xs::JLArray{T,1}) where {T} =
-    GPUArrays.sparse_from_dense(JLSparseVector{T,Int}, xs)
+    GPUArrays.to_sparse(JLSparseVector{T,Int}, xs)
 
 function SparseArrays.sparse(xs::JLArray{T,2}; fmt=:csc) where {T}
     if fmt == :csc
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 77c1544bb..406e1862a 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -73,7 +73,11 @@ end
     end
 end
 
-function sparse_from_dense(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
+# `to_sparse`/`to_dense` are the on-device conversions between a dense GPU array and a
+# sparse one. `to_sparse` needs a target format (the result layout is ambiguous);
+# `to_dense` does not.
+
+function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
     check_sparse_target(ST, Tv, Ti)
     indices = findall(!iszero, A)
     nstored = length(indices)
@@ -91,7 +95,7 @@ end
 # COO directly. Other formats are obtained by converting the COO with `sparse_csr`/
 # `sparse_csc` (which back-ends already provide as constructors), avoiding any need to map
 # a target type to its COO sibling.
-function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}}
+function to_sparse(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}}
     check_sparse_target(ST, Tv, Ti)
     indices = findall(!iszero, A)
     nstored = length(indices)
@@ -106,6 +110,47 @@ function sparse_from_dense(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:Abstrac
     return ST(rowInd, colInd, nzVal, size(A), nstored)
 end
 
+@kernel function densify_vector_kernel!(out, iPtr, nzVal)
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds out[iPtr[k]] = nzVal[k]
+    end
+end
+
+@kernel function densify_matrix_kernel!(out, rowInd, colInd, nzVal)
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds out[rowInd[k], colInd[k]] = nzVal[k]
+    end
+end
+
+"""
+    to_dense(A)
+
+Materialize the sparse GPU array `A` into a dense array of the same back-end, on the
+device (no host round-trip); the inverse of [`to_sparse`](@ref). Stored coordinates are
+assumed unique (true for CSC/CSR and for COO produced by conversion); duplicate COO
+coordinates resolve to the last value written.
+"""
+function to_dense(x::AbstractGPUSparseVector{Tv}) where {Tv}
+    out = fill!(similar(nonzeros(x), Tv, length(x)), zero(Tv))
+    if nnz(x) > 0
+        densify_vector_kernel!(get_backend(out))(out, SparseArrays.nonzeroinds(x), nonzeros(x);
+                                                 ndrange=nnz(x))
+    end
+    return out
+end
+
+function to_dense(A::AbstractGPUSparseMatrix{Tv}) where {Tv}
+    coo = sparse_coo(A)
+    out = fill!(similar(nonzeros(A), Tv, size(A)), zero(Tv))
+    if nnz(coo) > 0
+        densify_matrix_kernel!(get_backend(out))(out, coo.rowInd, coo.colInd, nonzeros(coo);
+                                                 ndrange=nnz(coo))
+    end
+    return out
+end
+
 # similar
 Base.similar(Vec::V) where {V<:AbstractGPUSparseVector} = V(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec)), length(Vec))
 Base.similar(Mat::M) where {M<:AbstractGPUSparseMatrixCSC} = M(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat)), size(Mat))

From 637acb0658b48f4c0c38adf15551efbf2eaa52d5 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 07:14:18 +0200
Subject: [PATCH 08/15] docs: restructure the sparse interface; document
 `to_sparse`/`to_dense`

Reorganize the sparse section of the interface docs into three parts -- the
storage types a back-end provides, the methods it implements to integrate them,
and the functionality it gets for free -- and round it out (device-struct
`Adapt` rule, `get_backend`, `_sptranspose`/`_spadjoint`, the `to_sparse`/
`to_dense` conversions, and the full list of generic operations). Also give
`to_sparse` a docstring to match `to_dense`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/src/interface.md | 82 ++++++++++++++++++++++++-------------------
 src/host/sparse.jl    | 11 ++++--
 2 files changed, 53 insertions(+), 40 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index 4bd21efa0..0aeb49636 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -33,15 +33,16 @@ There are numerous examples of potential interfaces for GPUArrays, such as with
 
 ## Sparse arrays
 
-Sparse arrays cannot share the `AbstractGPUArray` supertype: that is a
-`DenseArray`, whereas a sparse array must be an `AbstractSparseArray`. GPUArrays
-therefore provides a parallel hierarchy, and generic sparse functionality
-(broadcast, `mapreduce`, sparse/dense matrix multiplication, `norm`/`opnorm`,
-`findnz`, `triu`/`tril`/`kron`, format conversion, indexing) is defined on it.
+A sparse array can't share the `AbstractGPUArray` supertype — that is a `DenseArray`,
+whereas a sparse array must be an `AbstractSparseArray` — so GPUArrays keeps a parallel
+sparse hierarchy with its own generic functionality. Integrating a back-end has three
+parts: the storage types it provides, the methods it implements to plug them in, and the
+functionality it then gets for free.
 
-A back-end provides one mutable struct per supported format, subtyping the
-matching abstract type and using the conventional field names (the generic code
-reads them directly):
+### Storage types to provide
+
+One mutable struct per supported format, subtyping the matching abstract type and using
+the conventional field names (generic code reads them directly):
 
 | supertype | fields |
 |:--|:--|
@@ -50,35 +51,42 @@ reads them directly):
 | `AbstractGPUSparseMatrixCSR{Tv,Ti}` | `rowPtr`, `colVal`, `nzVal`, `dims`, `nnz` |
 | `AbstractGPUSparseMatrixCOO{Tv,Ti}` | `rowInd`, `colInd`, `nzVal`, `dims`, `nnz` |
 
-where the index/pointer/value arrays are the back-end's own dense vector type
-(so `nonzeros(A)`, `SparseArrays.nonzeroinds`, `rowvals`, `getcolptr` work, and
-`get_backend(nonzeros(A))` identifies the compute backend).
-
-On top of the structs, a back-end implements:
-
-  * constructors from the component arrays (e.g.
-    `MyCSR(rowPtr, colVal, nzVal, dims)`) and the inter-format and
-    dense↔sparse conversions (`MyCSR(::MyCOO)`, `MyCSC(::SparseMatrixCSC)`,
-    `SparseMatrixCSC(::MyCSC)`, …);
-
-  * `Base.similar` — structure-preserving (`similar(A)`, `similar(A, ::Type)`)
-    and empty-of-a-shape (`similar(A, ::Type, dims)`), exactly as for dense
-    arrays. The generic algorithms allocate their outputs through `similar`
-    rather than by naming a type;
-
-  * the format-conversion verbs `GPUArrays.sparse_csc`, `GPUArrays.sparse_csr`
-    and `GPUArrays.sparse_coo`, for the formats other than the one each
-    produces (converting to the format already held is the generic identity).
-    These are the value-level equivalent of `SparseArrays.sparse` and back the
-    generic code that needs a particular layout (e.g. matrix multiplication
-    funnels through COO).
-
-Dense↔sparse conversion is provided generically and on-device: `to_sparse(::Type{ST},
-A)` scans a dense array into a sparse one (`ST` is an `AbstractGPUSparseVector` or
-`AbstractGPUSparseMatrixCOO` -- CSR/CSC follow via the conversion verbs), and `to_dense(A)`
-is the inverse, scattering a sparse array into a dense one of the same back-end. A
-back-end's dense↔sparse constructors (`MyArray(::MySparse…)`, `MyCSR(::MyDense)`) can be
-defined in terms of these.
+The pointer/index/value arrays are the back-end's own dense vector type. Provide only the
+formats you need, but note that several generic operations route through COO.
+
+### Interface to implement
+
+  * **Constructors** — from component arrays (`MyCSR(rowPtr, colVal, nzVal, dims)`),
+    between formats (`MyCSR(::MyCOO)`, …), and to/from host `SparseArrays`
+    (`MyCSC(::SparseMatrixCSC)`, `SparseMatrixCSC(::MyCSC)`).
+  * **`Base.similar`** — structure-preserving (`similar(A)`, `similar(A, ::Type)`) and
+    empty-of-a-shape (`similar(A, ::Type, dims)`), as for dense arrays; generic code
+    allocates its outputs through `similar`, never by naming a type.
+  * **Conversion verbs** `GPUArrays.sparse_csc`/`sparse_csr`/`sparse_coo`, for the formats
+    other than the one each produces (the identity case is generic) — the value-level
+    analogue of `SparseArrays.sparse`.
+  * **`KernelAbstractions.get_backend`** for the sparse types (usually
+    `get_backend(nonzeros(A))`).
+  * **`Adapt.adapt_structure`** converting each host struct to its device counterpart
+    (`GPUArrays.GPUSparseDeviceVector`, `GPUSparseDeviceMatrixCSC`/`CSR`/`COO`), so the
+    generic kernels can consume it inside `@kernel`s.
+  * **`GPUArrays._sptranspose`/`_spadjoint`** — materialize a (conjugate) transpose; used
+    by `kron`/`triu`/`tril` on lazily wrapped operands.
+
+`SparseArrays`' accessors (`nnz`, `nonzeros`, `nonzeroinds`, `rowvals`, `getcolptr`) come
+for free from the field names. Dense↔sparse conversion is generic and on-device:
+`to_sparse(::Type{ST}, dense)` scans into a sparse array (`ST` a vector or COO type;
+CSR/CSC follow via the verbs) and `to_dense(A)` scatters back to a dense array of the
+back-end — so a back-end's `MyArray(::MySparse…)` and dense→sparse constructors can simply
+call them.
+
+### Functionality you get
+
+Broadcasting; `mapreduce` and reductions (`sum`, `norm`, `opnorm`); sparse–dense and
+sparse–vector multiplication (`*`, `mul!`, including transposed/adjoint operands);
+`findnz`, `triu`/`tril`/`kron`/`reshape`/`droptol!`; `iszero`/`issymmetric`/`ishermitian`;
+scalar and slice indexing; `copy`/`copyto!`/`collect`/`Array`; and conversion between
+formats and to/from dense.
 
 ## Caching Allocator
 
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 406e1862a..f057016c0 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -73,10 +73,15 @@ end
     end
 end
 
-# `to_sparse`/`to_dense` are the on-device conversions between a dense GPU array and a
-# sparse one. `to_sparse` needs a target format (the result layout is ambiguous);
-# `to_dense` does not.
+"""
+    to_sparse(ST, A)
 
+Build a sparse array of type `ST` from the dense GPU array `A`, on the device (no host
+round-trip); the inverse of [`to_dense`](@ref). `ST` must be a concrete
+`AbstractGPUSparseVector` or `AbstractGPUSparseMatrixCOO` (other matrix formats follow by
+converting the resulting COO with `sparse_csr`/`sparse_csc`). The target format is required
+because the result layout would otherwise be ambiguous; `to_dense` needs no such argument.
+"""
 function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
     check_sparse_target(ST, Tv, Ti)
     indices = findall(!iszero, A)

From 22313e19f088a4e256655b7e74f283059c3df6b1 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 14:54:50 +0200
Subject: [PATCH 09/15] Add on-device sparse format conversions via `convert`

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/host/sparse.jl | 171 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 2 deletions(-)

diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index f057016c0..7c381b0d8 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -195,12 +195,179 @@ end
 
 sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A)
 
-# Format-conversion verbs: converting to the format already held is the identity;
-# back-ends implement the cross-format conversions (as constructors).
+# Format-conversion verbs: converting to the format already held is the identity. The
+# cross-format conversions are `convert` (below). A back-end supplies the 3 verb methods
+# naming its concrete sibling types — `sparse_coo(A::MyCSC) = convert(MyCOO, A)` — which is
+# also where its `MyCOO(::MyCSC)` constructors route.
 sparse_csc(A::AbstractGPUSparseMatrixCSC) = A
 sparse_csr(A::AbstractGPUSparseMatrixCSR) = A
 sparse_coo(A::AbstractGPUSparseMatrixCOO) = A
 
+## Generic format conversions (CSC↔CSR↔COO), on-device.
+#
+# Conversion is owned by `convert` (dispatched on GPUArrays' own abstract types, so not type
+# piracy); construction is owned by the back-end through its conventional field constructor,
+# which `convert` calls — the same division `SparseArrays` uses (`convert`/constructors for
+# conversion, `SparseMatrixCSC(m, n, colptr, rowval, nzval)` for construction). Conversions
+# preserve the source's value/index eltypes; every buffer is allocated from a source array
+# via `similar`, so the storage mode and back-end are inherited. `sortperm` on a back-end
+# vector is required (the COO→CSR/CSC path; already implied by `findnz`).
+
+@inline function sparse_lower_bound(xs, lo::Ti, hi::Ti, key::Ti) where {Ti}
+    l = lo
+    r = hi
+    while l < r
+        m = (l + r) >>> 1
+        @inbounds v = xs[m]
+        if v < key
+            l = m + one(Ti)
+        else
+            r = m
+        end
+    end
+    return l
+end
+
+# Expand a compressed pointer array (CSC `colPtr` / CSR `rowPtr`, length `dim+1`) into one
+# explicit lead index per stored entry: thread `lead` writes its index into the slots
+# `ptr[lead]:ptr[lead+1]-1` of `out`.
+@kernel function expand_ptr_kernel!(out, ptr)
+    lead = @index(Global, Linear)
+    if lead <= length(ptr) - 1
+        T = eltype(out)
+        @inbounds begin
+            k = ptr[lead]
+            stop = ptr[lead + 1]
+            v = T(lead)
+            while k < stop
+                out[k] = v
+                k += oneunit(k)
+            end
+        end
+    end
+end
+
+# Build a lead-major linear sort key per stored COO entry (lead-major: sorting by the key
+# orders entries by `(lead, sub)`); `subdim` is the size of the sub axis.
+@kernel function coo_sort_keys_kernel!(keys, leadInd, subInd, subdim)
+    i = @index(Global, Linear)
+    if i <= length(keys)
+        @inbounds keys[i] = (Int64(leadInd[i]) - Int64(1)) * Int64(subdim) + Int64(subInd[i])
+    end
+end
+
+# Gather the three COO arrays through a permutation (e.g. from `sortperm`).
+@kernel function coo_gather_kernel!(leadOut, subOut, nzOut, leadIn, subIn, nzIn, perm)
+    i = @index(Global, Linear)
+    if i <= length(perm)
+        @inbounds begin
+            src = perm[i]
+            leadOut[i] = leadIn[src]
+            subOut[i] = subIn[src]
+            nzOut[i] = nzIn[src]
+        end
+    end
+end
+
+# Build a compressed pointer array from sorted lead indices: `ptr[i]` is the first 1-based
+# position of a stored entry whose lead index is ≥ `i`, and `ptr[end] = nnz+1`.
+@kernel function compressed_ptr_from_sorted_kernel!(ptr, sortedLead, nnz)
+    i = @index(Global, Linear)
+    Ti = eltype(ptr)
+    if i <= length(ptr)
+        @inbounds ptr[i] = if i == length(ptr)
+            Ti(nnz) + one(Ti)
+        else
+            sparse_lower_bound(sortedLead, one(Ti), Ti(nnz) + one(Ti), Ti(i))
+        end
+    end
+end
+
+# Order a COO matrix's stored entries lead-major, returning fresh (lead, sub, nz) arrays at
+# the source eltypes. Duplicate coordinates are preserved (no summing).
+function _sorted_coo(A::AbstractGPUSparseMatrixCOO{Tv,Ti}, ::Val{lead}) where {Tv,Ti,lead}
+    lead === :row || lead === :col ||
+        throw(ArgumentError("COO sort lead must be :row or :col"))
+    total = Int(nnz(A))
+    leadIn = lead === :row ? A.rowInd : A.colInd
+    subIn  = lead === :row ? A.colInd : A.rowInd
+    subdim = lead === :row ? size(A, 2) : size(A, 1)
+
+    leadOut = similar(leadIn, Ti, total)
+    subOut  = similar(subIn, Ti, total)
+    nzOut   = similar(nonzeros(A), Tv, total)
+    total == 0 && return leadOut, subOut, nzOut
+
+    backend = get_backend(leadOut)
+    keys = similar(leadIn, Int64, total)
+    coo_sort_keys_kernel!(backend)(keys, leadIn, subIn, subdim; ndrange=total)
+    perm = sortperm(keys)
+    coo_gather_kernel!(backend)(leadOut, subOut, nzOut, leadIn, subIn, nonzeros(A), perm;
+                                ndrange=total)
+    unsafe_free!(keys)
+    unsafe_free!(perm)
+    return leadOut, subOut, nzOut
+end
+
+# compressed pointer array of length `dim+1` from (possibly empty) sorted lead indices,
+# inheriting the lead array's index eltype and storage.
+function _compressed_ptr(sortedLead, dim::Integer, n::Integer)
+    ptr = similar(sortedLead, dim + 1)
+    if n > 0
+        compressed_ptr_from_sorted_kernel!(get_backend(ptr))(ptr, sortedLead, n;
+                                                             ndrange=dim + 1)
+    else
+        fill!(ptr, one(eltype(ptr)))
+    end
+    return ptr
+end
+
+# Cross-format conversions are ordinary `convert` methods, dispatched on GPUArrays' own
+# abstract types (so this is not type piracy and avoids the ambiguity a generic constructor
+# would have with a back-end's host-array constructor). The result is built with the
+# back-end's conventional field constructor `ST(components..., dims[, nnz])` — the write-side
+# analogue of the `colPtr`/`rowVal`/`nzVal` read contract, and how `SparseArrays` itself
+# builds (`SparseMatrixCSC(m, n, colptr, rowval, nzval)`). A back-end with unconventional
+# construction overrides the relevant `convert` method.
+
+# CSR → COO: expand `rowPtr` into explicit row indices; columns and values are shared.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCOO}
+    n = Int(nnz(A))
+    rowInd = similar(A.colVal, n)
+    n > 0 && expand_ptr_kernel!(get_backend(rowInd))(rowInd, A.rowPtr; ndrange=size(A, 1))
+    return ST(rowInd, A.colVal, nonzeros(A), size(A), n)
+end
+
+# CSC → COO: expand `colPtr` into explicit column indices; rows and values are shared.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCOO}
+    n = Int(nnz(A))
+    colInd = similar(A.rowVal, n)
+    n > 0 && expand_ptr_kernel!(get_backend(colInd))(colInd, A.colPtr; ndrange=size(A, 2))
+    return ST(A.rowVal, colInd, nonzeros(A), size(A), n)
+end
+
+# COO → CSR: sort row-major, then compress the row indices into a `rowPtr`.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSR}
+    rowInd, colVal, nzVal = _sorted_coo(A, Val(:row))
+    rowPtr = _compressed_ptr(rowInd, size(A, 1), Int(nnz(A)))
+    unsafe_free!(rowInd)
+    return ST(rowPtr, colVal, nzVal, size(A))
+end
+
+# COO → CSC: sort column-major, then compress the column indices into a `colPtr`.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSC}
+    colInd, rowVal, nzVal = _sorted_coo(A, Val(:col))
+    colPtr = _compressed_ptr(colInd, size(A, 2), Int(nnz(A)))
+    unsafe_free!(colInd)
+    return ST(colPtr, rowVal, nzVal, size(A))
+end
+
+# CSC ↔ CSR route through COO (the back-end's verb yields its concrete COO type).
+Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCSR} =
+    convert(ST, sparse_coo(A))
+Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCSC} =
+    convert(ST, sparse_coo(A))
+
 @inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x)
     @inbounds Atomix.@atomic A[Int(I)] += x
     return

From 780e855bcc098e8056a6f528c4d0e0e083e8e975 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 14:54:50 +0200
Subject: [PATCH 10/15] Adopt generic sparse conversions in JLArrays

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/JLArrays/src/JLArrays.jl | 40 +++++++++++++++---------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index aa935950c..882f7ba1d 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -455,30 +455,22 @@ end
 JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
 JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} =
     GPUArrays.to_sparse(JLSparseMatrixCOO{T,Int}, xs)
-function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCOO(xs::Union{JLSparseMatrixCSC{Tv, Ti},JLSparseMatrixCSR{Tv, Ti}}) where {Tv, Ti}
-    return JLSparseMatrixCOO(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCOO{Tv,Ti}(xs::Union{JLSparseMatrixCSC{Tv, Ti},JLSparseMatrixCSR{Tv, Ti}}) where {Tv, Ti}
-    return JLSparseMatrixCOO(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSC(xs::JLSparseMatrixCSR{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSC(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSC{Tv,Ti}(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSR(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSR{Tv,Ti}(xs::JLSparseMatrixCOO{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
-end
+# Cross-format device→device conversions (CSC↔CSR↔COO): the back-end owns the constructors;
+# the on-device conversion algorithm is GPUArrays' generic `convert` (replacing the former
+# host round-trips through `SparseMatrixCSC`). The `sparse_csc/csr/coo` verbs above route to
+# these. The COO→CSR/CSC path needs `sortperm`, provided below.
+JLSparseMatrixCOO(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) =
+    convert(JLSparseMatrixCOO, A)
+JLSparseMatrixCSR(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) =
+    convert(JLSparseMatrixCSR, A)
+JLSparseMatrixCSC(A::Union{JLSparseMatrixCSR,JLSparseMatrixCOO}) =
+    convert(JLSparseMatrixCSC, A)
+
+# `sortperm` on a JLArray vector, required by the generic COO→CSR/CSC conversions (and by
+# `findnz`). Computed on the host and wrapped back into a device vector so that the gather
+# kernel can index the permutation on-device (mirrors Metal's `sortperm`, which also returns
+# a device `Int` vector).
+Base.sortperm(x::JLVector; kwargs...) = JLArray(sortperm(Array(x); kwargs...))
 
 SparseArrays.sparse(xs::JLArray{T,1}) where {T} =
     GPUArrays.to_sparse(JLSparseVector{T,Int}, xs)

From f0be29ffb004a4dd8bfe7c9cd6aeaa6792912df5 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 14:54:50 +0200
Subject: [PATCH 11/15] Drop `sparse_coo_type`; test COO via `sparse_types`

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl         |  3 +--
 test/testsuite.jl        |  1 -
 test/testsuite/sparse.jl | 26 ++++++++++++++++++--------
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5d13993ad..f60c43e60 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,8 +8,7 @@ const init_worker_code = quote
 
     include("testsuite.jl")
 
-    TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR)
-    TestSuite.sparse_coo_type(::Type{<:JLArray}) = JLSparseMatrixCOO
+    TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO)
     TestSuite.sparse_types(::Type{<:Array})   = (SparseVector, SparseMatrixCSC)
 
     # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved
diff --git a/test/testsuite.jl b/test/testsuite.jl
index f95c7366e..e4aea4f72 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -76,7 +76,6 @@ supported_eltypes() = (Int16, Int32, Int64,
 # derived sparse types that are supported by the array type
 
 sparse_types(::Type{AT}) where {AT} = ()
-sparse_coo_type(::Type{AT}) where {AT} = nothing
 
 # some convenience predicates for filtering test eltypes
 isrealtype(T) = T <: Real
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 7a15dd09e..51d93e678 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -7,16 +7,26 @@
             broadcasting_vector(sparse_AT, AT, eltypes)
             iszero_vector(sparse_AT, eltypes)
         elseif sparse_AT <: AbstractSparseMatrix
-            matrix(sparse_AT, eltypes)
-            matrix_construction(sparse_AT, AT, eltypes)
-            broadcasting_matrix(sparse_AT, AT, eltypes)
-            mapreduce_matrix(sparse_AT, eltypes)
-            linalg(sparse_AT, eltypes)
-            iszero_matrix(sparse_AT, eltypes)
+            # The matrix battery is only run for CSC/CSR. COO lacks the generic operations
+            # these exercise: broadcast (which `iszero_matrix` and several construction
+            # paths use), dims-reductions, `opnorm`, and `issymmetric`/`similar`-with-shape.
+            # COO is still registered in `sparse_types` so the repeated-index `coo_matmul`
+            # test below is discovered; the loop itself doesn't otherwise exercise it.
+            if sparse_AT <: Union{GPUArrays.AbstractGPUSparseMatrixCSC,
+                                  GPUArrays.AbstractGPUSparseMatrixCSR}
+                matrix(sparse_AT, eltypes)
+                matrix_construction(sparse_AT, AT, eltypes)
+                broadcasting_matrix(sparse_AT, AT, eltypes)
+                mapreduce_matrix(sparse_AT, eltypes)
+                linalg(sparse_AT, eltypes)
+                iszero_matrix(sparse_AT, eltypes)
+            end
        end
     end
-    coo_AT = sparse_coo_type(AT)
-    coo_AT === nothing || coo_matmul(coo_AT, AT, eltypes)
+    # the repeated-index COO matmul test runs against whichever COO type the back-end
+    # registers in `sparse_types` (no separate `sparse_coo_type` hook).
+    i = findfirst(T -> T <: GPUArrays.AbstractGPUSparseMatrixCOO, sparse_ATs)
+    i === nothing || coo_matmul(sparse_ATs[i], AT, eltypes)
 end
 
 using SparseArrays

From 21b3dd7e18e03228b67684cc17e1757a4fb35070 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 16:10:05 +0200
Subject: [PATCH 12/15] Test sparse matmul and conversions generically

Also add the missing JLArrays COO identity constructor.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/JLArrays/src/JLArrays.jl |   2 +
 test/testsuite/sparse.jl     | 147 +++++++++++++++++++++++++++++++++--
 2 files changed, 144 insertions(+), 5 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 882f7ba1d..e18b2eabc 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -212,6 +212,8 @@ JLSparseMatrixCOO(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1},
                   nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer} =
     JLSparseMatrixCOO{Tv, Ti}(rowInd, colInd, nzVal, dims, nnz)
 
+JLSparseMatrixCOO(A::JLSparseMatrixCOO) = A
+
 Base.length(x::JLSparseMatrixCOO) = prod(x.dims)
 Base.size(x::JLSparseMatrixCOO) = x.dims
 SparseArrays.nnz(x::JLSparseMatrixCOO) = x.nnz
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 51d93e678..73134c68d 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -1,5 +1,6 @@
 @testsuite "sparse" (AT, eltypes)->begin
     sparse_ATs = sparse_types(AT)
+    matrix_ATs = filter(T -> T <: AbstractSparseMatrix, sparse_ATs)
     for sparse_AT in sparse_ATs
         if sparse_AT <: AbstractSparseVector
             vector(sparse_AT, AT, eltypes)
@@ -7,11 +8,13 @@
             broadcasting_vector(sparse_AT, AT, eltypes)
             iszero_vector(sparse_AT, eltypes)
         elseif sparse_AT <: AbstractSparseMatrix
-            # The matrix battery is only run for CSC/CSR. COO lacks the generic operations
-            # these exercise: broadcast (which `iszero_matrix` and several construction
-            # paths use), dims-reductions, `opnorm`, and `issymmetric`/`similar`-with-shape.
-            # COO is still registered in `sparse_types` so the repeated-index `coo_matmul`
-            # test below is discovered; the loop itself doesn't otherwise exercise it.
+            # `matmul`/`matmul_accumulation` run for every matrix format (incl. COO, which
+            # natively supports matmul). The rest of the matrix battery is CSC/CSR-only:
+            # COO lacks the generic operations they exercise — broadcast (used by
+            # `iszero_matrix` and several construction paths), dims-reductions, `opnorm`,
+            # and `issymmetric`/`similar`-with-shape.
+            matmul(sparse_AT, AT, eltypes)
+            matmul_accumulation(sparse_AT, AT, eltypes)
             if sparse_AT <: Union{GPUArrays.AbstractGPUSparseMatrixCSC,
                                   GPUArrays.AbstractGPUSparseMatrixCSR}
                 matrix(sparse_AT, eltypes)
@@ -23,6 +26,8 @@
             end
        end
     end
+    # cross-format conversions need ≥2 registered matrix formats to be meaningful.
+    length(matrix_ATs) > 1 && conversions(matrix_ATs, AT, eltypes)
     # the repeated-index COO matmul test runs against whichever COO type the back-end
     # registers in `sparse_types` (no separate `sparse_coo_type` hook).
     i = findfirst(T -> T <: GPUArrays.AbstractGPUSparseMatrixCOO, sparse_ATs)
@@ -119,6 +124,138 @@ function coo_matmul(AT, dense_AT, eltypes)
     end
 end
 
+# Narrow complex test data to the matmul eltype: complex eltypes keep both parts (so
+# `transpose` and `adjoint` differ); real eltypes take the real part, since a complex value
+# with a nonzero imaginary part can't be converted to a real type directly. Projecting onto
+# the real part reproduces the proven real test data exactly (e.g. `1 + 2im` → `1`).
+matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Complex} = ET.(x)
+matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Real}    = ET.(real.(x))
+matmul_cast(x::Number, ::Type{ET}) where {ET<:Complex} = ET(x)
+matmul_cast(x::Number, ::Type{ET}) where {ET<:Real}    = ET(real(x))
+
+# Per matrix format, general (duplicate-free) matmul: SpMV, SpMM, and dense·sparse, each
+# with transpose/adjoint and the 5-arg `mul!`. The repeated-coordinate COO case is covered
+# separately by `coo_matmul`.
+function matmul(AT, dense_AT, eltypes)
+    for ET in (Float16, Float32, ComplexF16, ComplexF32)
+        ET in eltypes || continue
+        @testset "$(nameof(AT)) matmul($ET)" begin
+            V = matmul_cast(ComplexF32[1 + 2im, 2 - im, 3 + im, 4 - 3im, 5 + 2im], ET)
+            A = sparse([1, 1, 2, 4, 4], [1, 3, 2, 1, 4], V, 4, 4)
+            dA = AT(A)
+
+            α = matmul_cast(2 + im, ET)
+            β = matmul_cast(3 - im, ET)
+
+            # SpMV, transpose/adjoint, and the 5-arg and bool `mul!`
+            x = matmul_cast(ComplexF32[2 - im, 3 + 2im, 5 - 4im, 7 + im], ET)
+            dx = dense_AT(x)
+            @test Array(dA * dx) ≈ Array(A * x)
+            @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x)
+            @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x)
+
+            y = matmul_cast(ComplexF32[1 + im, 2 - im, 3 + 2im, 4 - im], ET)
+            dy = dense_AT(copy(y))
+            mul!(dy, dA, dx, α, β)
+            @test Array(dy) ≈ α .* Array(A * x) .+ β .* y
+
+            dz = dense_AT(zeros(ET, size(A, 1)))
+            mul!(dz, dA, dx, true, false)
+            @test Array(dz) ≈ Array(A * x)
+
+            # SpMM, transpose/adjoint, and the 5-arg `mul!`
+            B = matmul_cast(ComplexF32[
+                1 + im 2 - im 3 + 2im;
+                4 - im 5 + im 6 - 2im;
+                7 + im 8 - im 9 + 3im;
+                2 + 4im 3 - 5im 4 + im
+            ], ET)
+            dB = dense_AT(B)
+            @test Array(dA * dB) ≈ Array(A * B)
+            @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B)
+            @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B)
+
+            C = matmul_cast(ComplexF32[
+                1 - im 2 + im 3 - 2im;
+                4 + im 5 - im 6 + 2im;
+                7 - im 8 + im 9 - 3im;
+                2 - 4im 3 + 5im 4 - im
+            ], ET)
+            dC = dense_AT(copy(C))
+            mul!(dC, dA, dB, α, β)
+            @test Array(dC) ≈ α .* Array(A * B) .+ β .* C
+
+            # dense·sparse, transpose/adjoint, and the 5-arg `mul!`
+            L = matmul_cast(ComplexF32[
+                1 - im 2 + im 3 - 2im 4 + im;
+                5 + 2im 6 - im 7 + im 8 - 3im
+            ], ET)
+            dL = dense_AT(L)
+            @test Array(dL * dA) ≈ Array(L * A)
+            @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A))
+            @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A))
+
+            D = matmul_cast(ComplexF32[
+                1 + im 2 - im 3 + 2im 4 - im;
+                2 + 3im 3 - im 5 + im 7 - 2im
+            ], ET)
+            dD = dense_AT(copy(D))
+            mul!(dD, dL, dA, α, β)
+            @test Array(dD) ≈ α .* Array(L * A) .+ β .* D
+        end
+    end
+end
+
+# Per matrix format, the Float16/ComplexF16 accumulation contract: the back-end must
+# accumulate in Float32/ComplexF32 and round once at the end. Every product feeding a given
+# output entry is identical, so the n-term sum is independent of accumulation order, and the
+# result must match a host reference accumulated in `Tacc` *exactly* (`==`, not `≈`); a
+# narrow Float16 accumulation would lose precision and the equality would fail.
+function matmul_accumulation(AT, dense_AT, eltypes)
+    for ET in (Float16, ComplexF16)
+        ET in eltypes || continue
+        Tacc = ET <: Complex ? ComplexF32 : Float32
+        @testset "$(nameof(AT)) matmul accumulation($ET)" begin
+            n = 128
+            av = matmul_cast(0.1 + 0.2im, ET)
+            xv = matmul_cast(0.1 - 0.3im, ET)
+
+            # SpMV: a 1×n row times a length-n vector
+            A = sparse(fill(1, n), 1:n, fill(av, n), 1, n)
+            x = fill(xv, n)
+            dA = AT(A)
+            dx = dense_AT(x)
+            @test Array(dA * dx) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(x))
+
+            # SpMM: the same row times an n×3 dense matrix
+            B = fill(xv, n, 3)
+            dB = dense_AT(B)
+            @test Array(dA * dB) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(B))
+
+            # dense·sparse: a 3×n dense matrix times an n×1 column
+            S = sparse(1:n, fill(1, n), fill(av, n), n, 1)
+            dS = AT(S)
+            L = fill(xv, 3, n)
+            dL = dense_AT(L)
+            @test Array(dL * dS) == ET.(Tacc.(L) * SparseMatrixCSC{Tacc,Int}(S))
+        end
+    end
+end
+
+# Round-trip a duplicate-free matrix (and an empty one) through every ordered pair of
+# registered matrix formats: `src == dst` exercises the identity path and `src != dst` the
+# cross-format `convert`. Conversions preserve coordinates, so the round-trip is exact.
+function conversions(matrix_ATs, dense_AT, eltypes)
+    @testset "sparse format conversions" begin
+        A = sparse([1, 4, 2, 4, 1], [3, 1, 2, 5, 1], Float32[3, 4, 5, 6, 7], 4, 5)
+        Z = spzeros(Float32, 3, 4)
+        for src in matrix_ATs, dst in matrix_ATs
+            @test SparseMatrixCSC(dst(src(A))) == A
+            @test SparseMatrixCSC(dst(src(Z))) == Z
+        end
+    end
+end
+
 function vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "Sparse vector properties($ET)" begin

From d8a8c60e699931c9734641cf85a383883519ea38 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 18:29:13 +0200
Subject: [PATCH 13/15] Allocate empty sparse arrays via spzeros/undef

Mandate the undef constructor as the empty-of-shape primitive; similar delegates to it. JLArrays now provides spzeros + undef constructors.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/src/interface.md        | 14 ++++++++++++-
 lib/JLArrays/src/JLArrays.jl | 40 +++++++++++++++++++++++++++++++++---
 test/testsuite/sparse.jl     | 34 ++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index 0aeb49636..245a2bfa5 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -59,9 +59,21 @@ formats you need, but note that several generic operations route through COO.
   * **Constructors** — from component arrays (`MyCSR(rowPtr, colVal, nzVal, dims)`),
     between formats (`MyCSR(::MyCOO)`, …), and to/from host `SparseArrays`
     (`MyCSC(::SparseMatrixCSC)`, `SparseMatrixCSC(::MyCSC)`).
+  * **`undef` constructors** — `MyCSC{Tv,Ti}(undef, dims)` / `MyVec{Tv,Ti}(undef, n)`,
+    building a *structurally-empty* array (no stored entries), mirroring dense
+    `Array{T}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`. This
+    is the empty-of-a-shape allocation primitive. Note there is no uninitialized-structure
+    analogue: for a sparse array `undef` means empty, exactly as in `SparseArrays`.
+    Implementing these through a `spzeros(Tv, Ti, dims…; fmt=…)` helper (the value-level
+    analogue of `SparseArrays.spzeros`, with a format selector) is recommended — it also
+    serves as a convenient public, format-polymorphic entry point — but `spzeros` itself is
+    not mandated, since its signature is back-end-flavored (format symbols, storage modes)
+    whereas the `undef` constructor is uniform.
   * **`Base.similar`** — structure-preserving (`similar(A)`, `similar(A, ::Type)`) and
     empty-of-a-shape (`similar(A, ::Type, dims)`), as for dense arrays; generic code
-    allocates its outputs through `similar`, never by naming a type.
+    allocates its outputs through `similar`, never by naming a type. The empty-of-a-shape
+    form just delegates to the `undef` constructor (threading the source's storage mode),
+    so the constructor is the real primitive.
   * **Conversion verbs** `GPUArrays.sparse_csc`/`sparse_csr`/`sparse_coo`, for the formats
     other than the one each produces (the identity case is generic) — the value-level
     analogue of `SparseArrays.sparse`.
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index e18b2eabc..5331c1d75 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -282,14 +282,48 @@ Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}) where {T, Ti} =
 Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}) where {T, Ti} =
     JLSparseVector{T, Ti}(copy(Vec.iPtr), similar(nonzeros(Vec), T), length(Vec))
 
+# Empty (structurally-zero) sparse allocation — the value-level analogue of
+# `SparseArrays.spzeros` and the shared implementation behind the `undef` constructors and
+# the empty-of-shape `similar` below. The matrix method selects the format via `fmt`
+# (mirroring `sparse(A; fmt=…)`); the vector method takes a length. Unexported (so it never
+# clashes with `SparseArrays.spzeros`); call it as `JLArrays.spzeros`.
+function spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{2}; fmt=:csc) where {Tv, Ti}
+    if fmt === :csc
+        JLSparseMatrixCSC{Tv, Ti}(JLVector(ones(Ti, dims[2] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    elseif fmt === :csr
+        JLSparseMatrixCSR{Tv, Ti}(JLVector(ones(Ti, dims[1] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    elseif fmt === :coo
+        JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    else
+        throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo"))
+    end
+end
+spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{1}) where {Tv, Ti} =
+    JLSparseVector{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims[1])
+spzeros(::Type{Tv}, ::Type{Ti}, dims::Integer...; kw...) where {Tv, Ti} =
+    spzeros(Tv, Ti, map(Int, dims); kw...)
+spzeros(::Type{Tv}, dims::Union{Integer,Dims}...; kw...) where {Tv} =
+    spzeros(Tv, Int, dims...; kw...)
+
+# `undef` constructors — the mandated empty-of-shape allocation primitive, mirroring the
+# dense `JLArray{T,N}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`.
+JLSparseVector{Tv, Ti}(::UndefInitializer, len::Integer) where {Tv, Ti} =
+    spzeros(Tv, Ti, (Int(len),))
+JLSparseMatrixCSC{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csc)
+JLSparseMatrixCSR{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csr)
+JLSparseMatrixCOO{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:coo)
+JLSparseMatrixCSC{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSC{Tv, Ti}(u, (Int(m), Int(n)))
+JLSparseMatrixCSR{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSR{Tv, Ti}(u, (Int(m), Int(n)))
+JLSparseMatrixCOO{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCOO{Tv, Ti}(u, (Int(m), Int(n)))
+
 # empty sparse arrays of a given shape, used by the generic broadcast machinery as the
 # output container before its storage is sized and filled. the index type is preserved.
 Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
-    JLSparseMatrixCSC{T, Ti}(JLVector(ones(Ti, M + 1)), JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), (N, M))
+    JLSparseMatrixCSC{T, Ti}(undef, (N, M))
 Base.similar(Mat::JLSparseMatrixCSR{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
-    JLSparseMatrixCSR{T, Ti}(JLVector(ones(Ti, N + 1)), JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), (N, M))
+    JLSparseMatrixCSR{T, Ti}(undef, (N, M))
 Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}, dims::Dims{1}) where {T, Ti} =
-    JLSparseVector{T, Ti}(JLVector{Ti}(undef, 0), JLVector{T}(undef, 0), dims[1])
+    JLSparseVector{T, Ti}(undef, dims[1])
 
 Base.similar(Mat::JLSparseMatrixCSC{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
 Base.similar(Mat::JLSparseMatrixCSR{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 73134c68d..823e4e925 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -26,6 +26,7 @@
             end
        end
     end
+    undef_construction(sparse_ATs, eltypes)
     # cross-format conversions need ≥2 registered matrix formats to be meaningful.
     length(matrix_ATs) > 1 && conversions(matrix_ATs, AT, eltypes)
     # the repeated-index COO matmul test runs against whichever COO type the back-end
@@ -256,6 +257,39 @@ function conversions(matrix_ATs, dense_AT, eltypes)
     end
 end
 
+# The mandated empty-of-shape primitive: `ST{Tv,Ti}(undef, dims)` builds a structurally
+# empty array (no stored entries), mirroring dense `undef` and `SparseArrays`' undef
+# constructors. Exercised per registered format; the back-end's index type is read off a
+# reference instance so the test stays back-end agnostic.
+function undef_construction(sparse_ATs, eltypes)
+    @testset "undef construction" begin
+        for ST in sparse_ATs
+            if ST <: AbstractSparseVector
+                Ti = typeof(ST(spzeros(Float32, 3))).parameters[2]
+                for Tv in (Float32, ComplexF32)
+                    Tv in eltypes || continue
+                    v = ST{Tv,Ti}(undef, 5)
+                    @test v isa ST{Tv,Ti}
+                    @test size(v) == (5,)
+                    @test nnz(v) == 0
+                    @test SparseVector(v) == spzeros(Tv, 5)
+                end
+            else
+                Ti = typeof(ST(spzeros(Float32, 3, 4))).parameters[2]
+                for Tv in (Float32, ComplexF32)
+                    Tv in eltypes || continue
+                    A = ST{Tv,Ti}(undef, (3, 4))
+                    @test A isa ST{Tv,Ti}
+                    @test size(A) == (3, 4)
+                    @test nnz(A) == 0
+                    @test SparseMatrixCSC(A) == spzeros(Tv, 3, 4)
+                    @test size(ST{Tv,Ti}(undef, 3, 4)) == (3, 4)  # varargs form
+                end
+            end
+        end
+    end
+end
+
 function vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "Sparse vector properties($ET)" begin

From ae6812fa6ef3abd4f1ad0fabfb40cbbd69099b82 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 21:41:49 +0200
Subject: [PATCH 14/15] Convert sparse formats via coo_type/csr_type/csc_type
 hooks

Replaces the sparse branch's sparse_csc/csr/coo value-verbs with main's type-level format hooks, used as coo_type(A)(A); keeps the generic on-device convert algorithm as the engine. Re-aligns the conversion API with main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/src/interface.md        | 12 +++--
 lib/JLArrays/src/JLArrays.jl | 16 ++++---
 src/host/sparse.jl           | 92 +++++++++++++++++++-----------------
 3 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index 245a2bfa5..7be109009 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -74,9 +74,15 @@ formats you need, but note that several generic operations route through COO.
     allocates its outputs through `similar`, never by naming a type. The empty-of-a-shape
     form just delegates to the `undef` constructor (threading the source's storage mode),
     so the constructor is the real primitive.
-  * **Conversion verbs** `GPUArrays.sparse_csc`/`sparse_csr`/`sparse_coo`, for the formats
-    other than the one each produces (the identity case is generic) — the value-level
-    analogue of `SparseArrays.sparse`.
+  * **Format-conversion hooks** `GPUArrays.coo_type`/`csr_type`/`csc_type` — map any of your
+    sparse-matrix types to the *type* of the named sibling format
+    (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code converts with `coo_type(A)(A)`. These
+    are type-level hooks rather than plain `convert(Dest, A)` because a format is the
+    wrapper's identity (distinct structs), not a type parameter — so, unlike an eltype change,
+    there is no generic wrapper→sibling-wrapper operation, and only the back-end knows its
+    sibling types. The cross-format `convert` methods above are the engine the resulting
+    constructors route through; the identity case (`coo_type(coo)(coo)`) is your identity
+    constructor.
   * **`KernelAbstractions.get_backend`** for the sparse types (usually
     `get_backend(nonzeros(A))`).
   * **`Adapt.adapt_structure`** converting each host struct to its device counterpart
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 5331c1d75..5ebfa26b6 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -264,11 +264,13 @@ function Base.getindex(A::JLSparseMatrixCSR{Tv, Ti}, i0::Integer, i1::Integer) w
 end
 
 GPUArrays.storage(a::JLArray) = a.data
-# format-conversion verbs (cross-format only; the identity cases are handled generically
-# by GPUArrays). these reuse the format constructors defined below.
-GPUArrays.sparse_csc(A::Union{JLSparseMatrixCSR,JLSparseMatrixCOO}) = JLSparseMatrixCSC(A)
-GPUArrays.sparse_csr(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) = JLSparseMatrixCSR(A)
-GPUArrays.sparse_coo(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) = JLSparseMatrixCOO(A)
+const JLSparseMatrix{Tv,Ti} = Union{JLSparseMatrixCSC{Tv,Ti},JLSparseMatrixCSR{Tv,Ti},JLSparseMatrixCOO{Tv,Ti}}
+# format-conversion hooks: map any JLArrays sparse matrix to the *type* of its sibling
+# format. Generic GPUArrays code converts with `coo_type(A)(A)` etc., routing through the
+# format constructors defined below.
+GPUArrays.csc_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSC
+GPUArrays.csr_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSR
+GPUArrays.coo_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCOO
 
 Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat))
@@ -493,8 +495,8 @@ JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} =
     GPUArrays.to_sparse(JLSparseMatrixCOO{T,Int}, xs)
 # Cross-format device→device conversions (CSC↔CSR↔COO): the back-end owns the constructors;
 # the on-device conversion algorithm is GPUArrays' generic `convert` (replacing the former
-# host round-trips through `SparseMatrixCSC`). The `sparse_csc/csr/coo` verbs above route to
-# these. The COO→CSR/CSC path needs `sortperm`, provided below.
+# host round-trips through `SparseMatrixCSC`). The `csc_type/csr_type/coo_type` hooks above
+# route to these. The COO→CSR/CSC path needs `sortperm`, provided below.
 JLSparseMatrixCOO(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) =
     convert(JLSparseMatrixCOO, A)
 JLSparseMatrixCSR(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) =
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 7c381b0d8..c3bf5d4b6 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -79,7 +79,7 @@ end
 Build a sparse array of type `ST` from the dense GPU array `A`, on the device (no host
 round-trip); the inverse of [`to_dense`](@ref). `ST` must be a concrete
 `AbstractGPUSparseVector` or `AbstractGPUSparseMatrixCOO` (other matrix formats follow by
-converting the resulting COO with `sparse_csr`/`sparse_csc`). The target format is required
+converting the resulting COO with `csr_type`/`csc_type`). The target format is required
 because the result layout would otherwise be ambiguous; `to_dense` needs no such argument.
 """
 function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
@@ -97,8 +97,8 @@ function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSpar
 end
 
 # Densifying a dense array naturally produces coordinate (COO) format, so this builds a
-# COO directly. Other formats are obtained by converting the COO with `sparse_csr`/
-# `sparse_csc` (which back-ends already provide as constructors), avoiding any need to map
+# COO directly. Other formats are obtained by converting the COO with `csr_type`/
+# `csc_type` (which back-ends already provide as constructors), avoiding any need to map
 # a target type to its COO sibling.
 function to_sparse(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}}
     check_sparse_target(ST, Tv, Ti)
@@ -147,7 +147,7 @@ function to_dense(x::AbstractGPUSparseVector{Tv}) where {Tv}
 end
 
 function to_dense(A::AbstractGPUSparseMatrix{Tv}) where {Tv}
-    coo = sparse_coo(A)
+    coo = coo_type(A)(A)
     out = fill!(similar(nonzeros(A), Tv, size(A)), zero(Tv))
     if nnz(coo) > 0
         densify_matrix_kernel!(get_backend(out))(out, coo.rowInd, coo.colInd, nonzeros(coo);
@@ -195,13 +195,17 @@ end
 
 sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A)
 
-# Format-conversion verbs: converting to the format already held is the identity. The
-# cross-format conversions are `convert` (below). A back-end supplies the 3 verb methods
-# naming its concrete sibling types — `sparse_coo(A::MyCSC) = convert(MyCOO, A)` — which is
-# also where its `MyCOO(::MyCSC)` constructors route.
-sparse_csc(A::AbstractGPUSparseMatrixCSC) = A
-sparse_csr(A::AbstractGPUSparseMatrixCSR) = A
-sparse_coo(A::AbstractGPUSparseMatrixCOO) = A
+# Format-conversion hooks: a back-end maps any of its sparse-matrix types to the *type* of
+# its sibling format (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code then converts with
+# `coo_type(A)(A)`. This is a type-level hook rather than plain `convert(Dest, A)` because the
+# format is the wrapper's identity (distinct structs/fields), not a type parameter — so unlike
+# an eltype change there is no generic wrapper→sibling-wrapper operation, and only the back-end
+# knows its sibling types. The cross-format `convert` algorithm below is the engine those
+# constructors route through; the identity case (`coo_type(coo)(coo)`) is the back-end's
+# identity constructor.
+coo_type(a) = coo_type(typeof(a))
+csr_type(a) = csr_type(typeof(a))
+csc_type(a) = csc_type(typeof(a))
 
 ## Generic format conversions (CSC↔CSR↔COO), on-device.
 #
@@ -362,11 +366,11 @@ function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:Abst
     return ST(colPtr, rowVal, nzVal, size(A))
 end
 
-# CSC ↔ CSR route through COO (the back-end's verb yields its concrete COO type).
+# CSC ↔ CSR route through COO (the back-end's hook yields its concrete COO type).
 Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCSR} =
-    convert(ST, sparse_coo(A))
+    convert(ST, coo_type(A)(A))
 Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCSC} =
-    convert(ST, sparse_coo(A))
+    convert(ST, coo_type(A)(A))
 
 @inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x)
     @inbounds Atomix.@atomic A[Int(I)] += x
@@ -587,7 +591,7 @@ function LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar,
     y === x && throw(ArgumentError("output vector must not be aliased with input vector"))
     isempty(y) && return y
 
-    return spmv_coo!(y, sparse_coo(A), x, alpha, beta, trans, conjval)
+    return spmv_coo!(y, coo_type(A)(A), x, alpha, beta, trans, conjval)
 end
 
 LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
@@ -617,7 +621,7 @@ function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
     C === B && throw(ArgumentError("output matrix must not be aliased with input matrix"))
     isempty(C) && return C
 
-    return spmm_coo!(C, sparse_coo(A), Bop, alpha, beta, trans, conjval)
+    return spmm_coo!(C, coo_type(A)(A), Bop, alpha, beta, trans, conjval)
 end
 
 LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
@@ -647,7 +651,7 @@ function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
     C === A && throw(ArgumentError("output matrix must not be aliased with input matrix"))
     isempty(C) && return C
 
-    return dense_spmm_coo!(C, Aop, sparse_coo(B), alpha, beta, trans, conjval)
+    return dense_spmm_coo!(C, Aop, coo_type(B)(B), alpha, beta, trans, conjval)
 end
 
 function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2)
@@ -660,7 +664,7 @@ function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2)
     end
 end
 
-LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSC, p::Real=2) = opnorm(sparse_csr(A), p)
+LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSC, p::Real=2) = opnorm(csr_type(A)(A), p)
 
 function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T
     if p == Inf
@@ -675,7 +679,7 @@ function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T
 end
 
 function SparseArrays.findnz(S::MT) where {MT <: AbstractGPUSparseMatrix}
-    S2 = sparse_coo(S)
+    S2 = coo_type(S)(S)
     I = S2.rowInd
     J = S2.colInd
     V = S2.nzVal
@@ -715,61 +719,61 @@ end
 for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCSR]
     @eval begin
         LinearAlgebra.triu(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(sparse_coo(A), k) )
+            ST( triu(coo_type(A)(A), k) )
         LinearAlgebra.triu(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(sparse_coo(_sptranspose(parent(A))), k) )
+            ST( triu(coo_type(parent(A))(_sptranspose(parent(A))), k) )
         LinearAlgebra.triu(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(sparse_coo(_spadjoint(parent(A))), k) )
+            ST( triu(coo_type(parent(A))(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.tril(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(sparse_coo(A), k) )
+            ST( tril(coo_type(A)(A), k) )
         LinearAlgebra.tril(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(sparse_coo(_sptranspose(parent(A))), k) )
+            ST( tril(coo_type(parent(A))(_sptranspose(parent(A))), k) )
         LinearAlgebra.tril(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(sparse_coo(_spadjoint(parent(A))), k) )
+            ST( tril(coo_type(parent(A))(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.triu(A::Union{ST, Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(sparse_coo(A), 0) )
+            ST( triu(coo_type(A)(A), 0) )
         LinearAlgebra.tril(A::Union{ST,Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(sparse_coo(A), 0) )
+            ST( tril(coo_type(A)(A), 0) )
 
         LinearAlgebra.kron(A::ST, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(A), sparse_coo(B)) )
+            ST( kron(coo_type(A)(A), coo_type(B)(B)) )
         LinearAlgebra.kron(A::ST, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(A), B) )
+            ST( kron(coo_type(A)(A), B) )
         LinearAlgebra.kron(A::Diagonal, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, sparse_coo(B)) )
+            ST( kron(A, coo_type(B)(B)) )
 
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_sptranspose(parent(A))), sparse_coo(B)) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(B)(B)) )
         LinearAlgebra.kron(A::ST, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(A), sparse_coo(_sptranspose(parent(B)))) )
+            ST( kron(coo_type(A)(A), coo_type(parent(B))(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_sptranspose(parent(A))), sparse_coo(_sptranspose(parent(B)))) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_sptranspose(parent(A))), B) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), B) )
         LinearAlgebra.kron(A::Diagonal, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, sparse_coo(_sptranspose(parent(B)))) )
+            ST( kron(A, coo_type(parent(B))(_sptranspose(parent(B)))) )
 
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_spadjoint(parent(A))), sparse_coo(B)) )
+            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), coo_type(B)(B)) )
         LinearAlgebra.kron(A::ST, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(A), sparse_coo(_spadjoint(parent(B)))) )
+            ST( kron(coo_type(A)(A), coo_type(parent(B))(_spadjoint(parent(B)))) )
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_spadjoint(parent(A))), sparse_coo(_spadjoint(parent(B)))) )
+            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), coo_type(parent(B))(_spadjoint(parent(B)))) )
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(sparse_coo(_spadjoint(parent(A))), B) )
+            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), B) )
         LinearAlgebra.kron(A::Diagonal, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, sparse_coo(_spadjoint(parent(B)))) )
+            ST( kron(A, coo_type(parent(B))(_spadjoint(parent(B)))) )
 
 
         function Base.reshape(A::ST, dims::Dims) where {ST<:$SparseMatrixType}
-            B = sparse_coo(A)
+            B = coo_type(A)(A)
             ST(reshape(B, dims))
         end
 
         function SparseArrays.droptol!(A::ST, tol::Real) where {ST<:$SparseMatrixType}
-            B = sparse_coo(A)
+            B = coo_type(A)(A)
             droptol!(B, tol)
             copyto!(A, ST(B))
         end
@@ -1585,9 +1589,9 @@ function Base.mapreduce(f, op, A::AbstractGPUSparseMatrix; dims=:, init=nothing)
     in(dims, [Colon(), 1, 2]) || error("only dims=:, dims=1 or dims=2 is supported")
 
     if A isa AbstractGPUSparseMatrixCSR && dims == 1
-        A = sparse_csc(A)
+        A = csc_type(A)(A)
     elseif A isa AbstractGPUSparseMatrixCSC && dims == 2
-        A = sparse_csr(A)
+        A = csr_type(A)(A)
     end
     m, n      = size(A)
     val_array = nonzeros(A)

From 5115577d7e5c59b590596472e4f708f3ce56dba3 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 23 Jun 2026 23:22:48 +0200
Subject: [PATCH 15/15] Only run the matmul accumulation test on GPU sparse
 formats

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/testsuite/sparse.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 823e4e925..f54ffbf85 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -8,13 +8,13 @@
             broadcasting_vector(sparse_AT, AT, eltypes)
             iszero_vector(sparse_AT, eltypes)
         elseif sparse_AT <: AbstractSparseMatrix
-            # `matmul`/`matmul_accumulation` run for every matrix format (incl. COO, which
-            # natively supports matmul). The rest of the matrix battery is CSC/CSR-only:
-            # COO lacks the generic operations they exercise — broadcast (used by
-            # `iszero_matrix` and several construction paths), dims-reductions, `opnorm`,
-            # and `issymmetric`/`similar`-with-shape.
+            # `matmul` runs for every format (incl. COO). The matrix battery below is
+            # CSC/CSR-only: COO lacks the broadcast/reductions/`opnorm`/`issymmetric` it uses.
             matmul(sparse_AT, AT, eltypes)
-            matmul_accumulation(sparse_AT, AT, eltypes)
+            # Float32 accumulation is a GPU-kernel contract; stdlib `Array` accumulates
+            # narrowly, so assert it only for GPU formats.
+            sparse_AT <: GPUArrays.AbstractGPUSparseMatrix &&
+                matmul_accumulation(sparse_AT, AT, eltypes)
             if sparse_AT <: Union{GPUArrays.AbstractGPUSparseMatrixCSC,
                                   GPUArrays.AbstractGPUSparseMatrixCSR}
                 matrix(sparse_AT, eltypes)