diff --git a/Project.toml b/Project.toml
index 707ee08d..d0e72995 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ projects = ["lib/GPUArraysCore", "lib/JLArrays", "test", "docs"]
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
@@ -27,6 +28,7 @@ JLD2Ext = "JLD2"
 
 [compat]
 Adapt = "4.6.1"
+Atomix = "1.1"
 GPUArraysCore = "= 0.2.0"
 JLD2 = "0.4, 0.5, 0.6"
 KernelAbstractions = "0.9.28, 0.10"
diff --git a/docs/src/interface.md b/docs/src/interface.md
index a9c57c53..7be10900 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -31,6 +31,81 @@ KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
 
 There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/main/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/main/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/main/src/gpuarrays.jl).
 
+## Sparse arrays
+
+A sparse array can't share the `AbstractGPUArray` supertype — that is a `DenseArray`,
+whereas a sparse array must be an `AbstractSparseArray` — so GPUArrays keeps a parallel
+sparse hierarchy with its own generic functionality. Integrating a back-end has three
+parts: the storage types it provides, the methods it implements to plug them in, and the
+functionality it then gets for free.
+
+### Storage types to provide
+
+One mutable struct per supported format, subtyping the matching abstract type and using
+the conventional field names (generic code reads them directly):
+
+| supertype | fields |
+|:--|:--|
+| `AbstractGPUSparseVector{Tv,Ti}`    | `iPtr`, `nzVal`, `len`, `nnz` |
+| `AbstractGPUSparseMatrixCSC{Tv,Ti}` | `colPtr`, `rowVal`, `nzVal`, `dims`, `nnz` |
+| `AbstractGPUSparseMatrixCSR{Tv,Ti}` | `rowPtr`, `colVal`, `nzVal`, `dims`, `nnz` |
+| `AbstractGPUSparseMatrixCOO{Tv,Ti}` | `rowInd`, `colInd`, `nzVal`, `dims`, `nnz` |
+
+The pointer/index/value arrays are the back-end's own dense vector type. Provide only the
+formats you need, but note that several generic operations route through COO.
+
+### Interface to implement
+
+  * **Constructors** — from component arrays (`MyCSR(rowPtr, colVal, nzVal, dims)`),
+    between formats (`MyCSR(::MyCOO)`, …), and to/from host `SparseArrays`
+    (`MyCSC(::SparseMatrixCSC)`, `SparseMatrixCSC(::MyCSC)`).
+  * **`undef` constructors** — `MyCSC{Tv,Ti}(undef, dims)` / `MyVec{Tv,Ti}(undef, n)`,
+    building a *structurally-empty* array (no stored entries), mirroring dense
+    `Array{T}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`. This
+    is the empty-of-a-shape allocation primitive. Note there is no uninitialized-structure
+    analogue: for a sparse array `undef` means empty, exactly as in `SparseArrays`.
+    Implementing these through a `spzeros(Tv, Ti, dims…; fmt=…)` helper (the value-level
+    analogue of `SparseArrays.spzeros`, with a format selector) is recommended — it also
+    serves as a convenient public, format-polymorphic entry point — but `spzeros` itself is
+    not mandated, since its signature is back-end-flavored (format symbols, storage modes)
+    whereas the `undef` constructor is uniform.
+  * **`Base.similar`** — structure-preserving (`similar(A)`, `similar(A, ::Type)`) and
+    empty-of-a-shape (`similar(A, ::Type, dims)`), as for dense arrays; generic code
+    allocates its outputs through `similar`, never by naming a type. The empty-of-a-shape
+    form just delegates to the `undef` constructor (threading the source's storage mode),
+    so the constructor is the real primitive.
+  * **Format-conversion hooks** `GPUArrays.coo_type`/`csr_type`/`csc_type` — map any of your
+    sparse-matrix types to the *type* of the named sibling format
+    (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code converts with `coo_type(A)(A)`. These
+    are type-level hooks rather than plain `convert(Dest, A)` because a format is the
+    wrapper's identity (distinct structs), not a type parameter — so, unlike an eltype change,
+    there is no generic wrapper→sibling-wrapper operation, and only the back-end knows its
+    sibling types. The cross-format `convert` methods above are the engine the resulting
+    constructors route through; the identity case (`coo_type(coo)(coo)`) is your identity
+    constructor.
+  * **`KernelAbstractions.get_backend`** for the sparse types (usually
+    `get_backend(nonzeros(A))`).
+  * **`Adapt.adapt_structure`** converting each host struct to its device counterpart
+    (`GPUArrays.GPUSparseDeviceVector`, `GPUSparseDeviceMatrixCSC`/`CSR`/`COO`), so the
+    generic kernels can consume it inside `@kernel`s.
+  * **`GPUArrays._sptranspose`/`_spadjoint`** — materialize a (conjugate) transpose; used
+    by `kron`/`triu`/`tril` on lazily wrapped operands.
+
+`SparseArrays`' accessors (`nnz`, `nonzeros`, `nonzeroinds`, `rowvals`, `getcolptr`) come
+for free from the field names. Dense↔sparse conversion is generic and on-device:
+`to_sparse(::Type{ST}, dense)` scans into a sparse array (`ST` a vector or COO type;
+CSR/CSC follow via the verbs) and `to_dense(A)` scatters back to a dense array of the
+back-end — so a back-end's `MyArray(::MySparse…)` and dense→sparse constructors can simply
+call them.
+
+### Functionality you get
+
+Broadcasting; `mapreduce` and reductions (`sum`, `norm`, `opnorm`); sparse–dense and
+sparse–vector multiplication (`*`, `mul!`, including transposed/adjoint operands);
+`findnz`, `triu`/`tril`/`kron`/`reshape`/`droptol!`; `iszero`/`issymmetric`/`ishermitian`;
+scalar and slice indexing; `copy`/`copyto!`/`collect`/`Array`; and conversion between
+formats and to/from dense.
+
 ## Caching Allocator
 
 ```@docs
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 0efc5a5e..5ebfa26b 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -6,15 +6,14 @@
 
 module JLArrays
 
-export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR
+export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector,
+       JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO
 
 using GPUArrays
 
 using Adapt
 using SparseArrays, LinearAlgebra
 
-import GPUArrays: dense_array_type
-
 import KernelAbstractions
 import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
 
@@ -74,6 +73,11 @@ Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[])
 
     @inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(typed_data(A), index)
     @inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(typed_data(A), x, index)
+    Base.pointer(x::JLDeviceArray{T}) where {T} =
+        convert(Ptr{T}, pointer(x.data)) + x.offset
+    @inline function Base.pointer(x::JLDeviceArray{T}, i::Integer) where T
+        pointer(x) + (i - 1) * sizeof(T)
+    end
 end
 
 #
@@ -188,6 +192,48 @@ end
 
 JLSparseMatrixCSC(Mat::Union{Transpose{Tv, <:SparseMatrixCSC}, Adjoint{Tv, <:SparseMatrixCSC}}) where {Tv} = JLSparseMatrixCSC(JLSparseMatrixCSR(Mat))
 
+mutable struct JLSparseMatrixCOO{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCOO{Tv, Ti}
+    rowInd::JLArray{Ti, 1}
+    colInd::JLArray{Ti, 1}
+    nzVal::JLArray{Tv, 1}
+    dims::NTuple{2,Int}
+    nnz::Ti
+
+    function JLSparseMatrixCOO{Tv, Ti}(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1},
+                                       nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer},
+                                       nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer}
+        length(rowInd) == length(colInd) == length(nzVal) ||
+            throw(DimensionMismatch("COO row, column, and value arrays have different lengths"))
+        new{Tv, Ti}(rowInd, colInd, nzVal, (Int(dims[1]), Int(dims[2])), Ti(nnz))
+    end
+end
+JLSparseMatrixCOO(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1},
+                  nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer},
+                  nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer} =
+    JLSparseMatrixCOO{Tv, Ti}(rowInd, colInd, nzVal, dims, nnz)
+
+JLSparseMatrixCOO(A::JLSparseMatrixCOO) = A
+
+Base.length(x::JLSparseMatrixCOO) = prod(x.dims)
+Base.size(x::JLSparseMatrixCOO) = x.dims
+SparseArrays.nnz(x::JLSparseMatrixCOO) = x.nnz
+SparseArrays.nonzeros(x::JLSparseMatrixCOO) = x.nzVal
+
+function SparseArrays.SparseMatrixCSC(x::JLSparseMatrixCOO)
+    return sparse(Array(x.rowInd), Array(x.colInd), Array(x.nzVal), size(x)...)
+end
+
+function Base.getindex(A::JLSparseMatrixCOO{Tv}, i::Integer, j::Integer) where {Tv}
+    @boundscheck checkbounds(A, i, j)
+    acc = zero(Tv)
+    for k in 1:Int(nnz(A))
+        @inbounds if A.rowInd[k] == i && A.colInd[k] == j
+            acc += A.nzVal[k]
+        end
+    end
+    return acc
+end
+
 function Base.size(g::JLSparseMatrixCSR, d::Integer)
     if 1 <= d <= 2
         return g.dims[d]
@@ -218,33 +264,68 @@ function Base.getindex(A::JLSparseMatrixCSR{Tv, Ti}, i0::Integer, i1::Integer) w
 end
 
 GPUArrays.storage(a::JLArray) = a.data
-GPUArrays.dense_array_type(a::JLArray{T, N}) where {T, N} = JLArray{T, N}
-GPUArrays.dense_array_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, N}
-GPUArrays.dense_vector_type(a::JLArray{T, N}) where {T, N} = JLArray{T, 1}
-GPUArrays.dense_vector_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, 1}
-
-GPUArrays.sparse_array_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSC
-GPUArrays.sparse_array_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSR
-GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSR
-GPUArrays.sparse_array_type(sa::JLSparseVector) = JLSparseVector
-GPUArrays.sparse_array_type(::Type{<:JLSparseVector}) = JLSparseVector
-
-GPUArrays.dense_array_type(sa::JLSparseVector) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseVector}) = JLArray
-GPUArrays.dense_array_type(sa::JLSparseMatrixCSC) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSC}) = JLArray
-GPUArrays.dense_array_type(sa::JLSparseMatrixCSR) = JLArray
-GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSR}) = JLArray
-
-GPUArrays.csc_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSC
-GPUArrays.csr_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSR
+const JLSparseMatrix{Tv,Ti} = Union{JLSparseMatrixCSC{Tv,Ti},JLSparseMatrixCSR{Tv,Ti},JLSparseMatrixCOO{Tv,Ti}}
+# format-conversion hooks: map any JLArrays sparse matrix to the *type* of its sibling
+# format. Generic GPUArrays code converts with `coo_type(A)(A)` etc., routing through the
+# format constructors defined below.
+GPUArrays.csc_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSC
+GPUArrays.csr_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSR
+GPUArrays.coo_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCOO
 
 Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat))
-
-Base.similar(Mat::JLSparseMatrixCSC, T::Type, N::Int, M::Int) =  JLSparseMatrixCSC(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
-Base.similar(Mat::JLSparseMatrixCSR, T::Type, N::Int, M::Int) =  JLSparseMatrixCSR(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M))
+Base.similar(Mat::JLSparseMatrixCOO) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat)), size(Mat), nnz(Mat))
+Base.similar(Mat::JLSparseMatrixCOO, T::Type) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat), T), size(Mat), nnz(Mat))
+
+# CSC and vector structure-preserving `similar` (mirror the CSR/COO methods above; these
+# used to come for free from GPUArrays via a type-reconstruction helper).
+Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}) where {T, Ti} =
+    JLSparseMatrixCSC{T, Ti}(copy(Mat.colPtr), copy(Mat.rowVal), similar(nonzeros(Mat), T), size(Mat))
+Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}) where {T, Ti} =
+    JLSparseVector{T, Ti}(copy(Vec.iPtr), similar(nonzeros(Vec), T), length(Vec))
+
+# Empty (structurally-zero) sparse allocation — the value-level analogue of
+# `SparseArrays.spzeros` and the shared implementation behind the `undef` constructors and
+# the empty-of-shape `similar` below. The matrix method selects the format via `fmt`
+# (mirroring `sparse(A; fmt=…)`); the vector method takes a length. Unexported (so it never
+# clashes with `SparseArrays.spzeros`); call it as `JLArrays.spzeros`.
+function spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{2}; fmt=:csc) where {Tv, Ti}
+    if fmt === :csc
+        JLSparseMatrixCSC{Tv, Ti}(JLVector(ones(Ti, dims[2] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    elseif fmt === :csr
+        JLSparseMatrixCSR{Tv, Ti}(JLVector(ones(Ti, dims[1] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    elseif fmt === :coo
+        JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims)
+    else
+        throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo"))
+    end
+end
+spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{1}) where {Tv, Ti} =
+    JLSparseVector{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims[1])
+spzeros(::Type{Tv}, ::Type{Ti}, dims::Integer...; kw...) where {Tv, Ti} =
+    spzeros(Tv, Ti, map(Int, dims); kw...)
+spzeros(::Type{Tv}, dims::Union{Integer,Dims}...; kw...) where {Tv} =
+    spzeros(Tv, Int, dims...; kw...)
+
+# `undef` constructors — the mandated empty-of-shape allocation primitive, mirroring the
+# dense `JLArray{T,N}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`.
+JLSparseVector{Tv, Ti}(::UndefInitializer, len::Integer) where {Tv, Ti} =
+    spzeros(Tv, Ti, (Int(len),))
+JLSparseMatrixCSC{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csc)
+JLSparseMatrixCSR{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csr)
+JLSparseMatrixCOO{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:coo)
+JLSparseMatrixCSC{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSC{Tv, Ti}(u, (Int(m), Int(n)))
+JLSparseMatrixCSR{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSR{Tv, Ti}(u, (Int(m), Int(n)))
+JLSparseMatrixCOO{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCOO{Tv, Ti}(u, (Int(m), Int(n)))
+
+# empty sparse arrays of a given shape, used by the generic broadcast machinery as the
+# output container before its storage is sized and filled. the index type is preserved.
+Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
+    JLSparseMatrixCSC{T, Ti}(undef, (N, M))
+Base.similar(Mat::JLSparseMatrixCSR{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} =
+    JLSparseMatrixCSR{T, Ti}(undef, (N, M))
+Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}, dims::Dims{1}) where {T, Ti} =
+    JLSparseVector{T, Ti}(undef, dims[1])
 
 Base.similar(Mat::JLSparseMatrixCSC{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
 Base.similar(Mat::JLSparseMatrixCSR{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M)
@@ -255,9 +336,10 @@ Base.similar(Mat::JLSparseMatrixCSR, T::Type, dims::Tuple{Int, Int}) = similar(M
 Base.similar(Mat::JLSparseMatrixCSC, dims::Tuple{Int, Int}) = similar(Mat, dims...)
 Base.similar(Mat::JLSparseMatrixCSR, dims::Tuple{Int, Int}) = similar(Mat, dims...)
 
-JLArray(x::JLSparseVector)    = JLArray(collect(SparseVector(x)))
-JLArray(x::JLSparseMatrixCSC) = JLArray(collect(SparseMatrixCSC(x)))
-JLArray(x::JLSparseMatrixCSR) = JLArray(collect(SparseMatrixCSC(x)))
+JLArray(x::JLSparseVector)    = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCSC) = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCSR) = GPUArrays.to_dense(x)
+JLArray(x::JLSparseMatrixCOO) = GPUArrays.to_dense(x)
 
 # conversion of untyped data to a typed Array
 function typed_data(x::JLArray{T}) where {T}
@@ -370,6 +452,9 @@ function JLSparseVector(xs::SparseVector{Tv, Ti}) where {Ti, Tv}
     copyto!(nzVal, convert(Vector{Tv}, xs.nzval))
     return JLSparseVector{Tv, Ti}(iPtr, nzVal, length(xs),)
 end
+SparseArrays.SparseVector(xs::JLArray{<:Any,1}) = SparseVector(JLSparseVector(xs))
+JLSparseVector(xs::JLArray{T,1}) where {T} =
+    GPUArrays.to_sparse(JLSparseVector{T,Int}, xs)
 Base.length(x::JLSparseVector) = x.len
 Base.size(x::JLSparseVector) = (x.len,)
 
@@ -382,7 +467,9 @@ function JLSparseMatrixCSC(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     copyto!(nzVal,  convert(Vector{Tv}, xs.nzval))
     return JLSparseMatrixCSC{Tv, Ti}(colPtr, rowVal, nzVal, (xs.m, xs.n))
 end
+SparseArrays.SparseMatrixCSC(xs::JLArray{<:Any,2}) = SparseMatrixCSC(JLSparseMatrixCSC(xs))
 JLSparseMatrixCSC(xs::SparseVector) = JLSparseMatrixCSC(SparseMatrixCSC(xs))
+JLSparseMatrixCSC(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSC(JLSparseMatrixCOO(xs))
 Base.length(x::JLSparseMatrixCSC) = prod(x.dims)
 Base.size(x::JLSparseMatrixCSC) = x.dims
 
@@ -397,12 +484,47 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
     return JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, (xs.m, xs.n))
 end
 JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs))
-function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSR(SparseMatrixCSC(xs))
-end
-function JLSparseMatrixCSC(xs::JLSparseMatrixCSR{Tv, Ti}) where {Ti, Tv}
-    return JLSparseMatrixCSC(SparseMatrixCSC(xs))
+JLSparseMatrixCSR(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSR(JLSparseMatrixCOO(xs))
+function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv}
+    rowInd, colInd, nzVal = findnz(xs)
+    return JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(rowInd), JLVector{Ti}(colInd),
+                                     JLVector{Tv}(nzVal), (xs.m, xs.n))
+end
+JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs))
+JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} =
+    GPUArrays.to_sparse(JLSparseMatrixCOO{T,Int}, xs)
+# Cross-format device→device conversions (CSC↔CSR↔COO): the back-end owns the constructors;
+# the on-device conversion algorithm is GPUArrays' generic `convert` (replacing the former
+# host round-trips through `SparseMatrixCSC`). The `csc_type/csr_type/coo_type` hooks above
+# route to these. The COO→CSR/CSC path needs `sortperm`, provided below.
+JLSparseMatrixCOO(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) =
+    convert(JLSparseMatrixCOO, A)
+JLSparseMatrixCSR(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) =
+    convert(JLSparseMatrixCSR, A)
+JLSparseMatrixCSC(A::Union{JLSparseMatrixCSR,JLSparseMatrixCOO}) =
+    convert(JLSparseMatrixCSC, A)
+
+# `sortperm` on a JLArray vector, required by the generic COO→CSR/CSC conversions (and by
+# `findnz`). Computed on the host and wrapped back into a device vector so that the gather
+# kernel can index the permutation on-device (mirrors Metal's `sortperm`, which also returns
+# a device `Int` vector).
+Base.sortperm(x::JLVector; kwargs...) = JLArray(sortperm(Array(x); kwargs...))
+
+SparseArrays.sparse(xs::JLArray{T,1}) where {T} =
+    GPUArrays.to_sparse(JLSparseVector{T,Int}, xs)
+
+function SparseArrays.sparse(xs::JLArray{T,2}; fmt=:csc) where {T}
+    if fmt == :csc
+        return JLSparseMatrixCSC(xs)
+    elseif fmt == :csr
+        return JLSparseMatrixCSR(xs)
+    elseif fmt == :coo
+        return JLSparseMatrixCOO(xs)
+    else
+        throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo"))
+    end
 end
+
 function Base.copyto!(dst::JLSparseMatrixCSR, src::JLSparseMatrixCSR)
     if size(dst) != size(src)
         throw(ArgumentError("Inconsistent Sparse Matrix size"))
@@ -456,6 +578,13 @@ Adapt.adapt_storage(::Type{Array}, xs::JLArray) = convert(Array, xs)
 Base.convert(::Type{T}, x::T) where T <: JLArray = x
 
 
+## indexing
+
+Base.findall(bools::JLArray{Bool}) = JLArray(findall(Array(bools)))
+Base.findall(f::Function, A::JLArray) = JLArray(findall(f, Array(A)))
+Base.findall(f::Base.Fix2{typeof(in)}, A::JLArray) = JLArray(findall(f, Array(A)))
+
+
 ## broadcast
 
 import Base.Broadcast: BroadcastStyle, Broadcasted
@@ -568,13 +697,15 @@ Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSC{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceMatrixCSC{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), x.dims, x.nnz)
 Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSR{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceMatrixCSR{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), x.dims, x.nnz)
+Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCOO{Tv,Ti}) where {Tv,Ti} =
+GPUSparseDeviceMatrixCOO{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), x.dims, x.nnz)
 Adapt.adapt_structure(to::Adaptor, x::JLSparseVector{Tv,Ti}) where {Tv,Ti} =
 GPUSparseDeviceVector{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.iPtr), adapt(to, x.nzVal), x.len, x.nnz)
 
 ## KernelAbstractions interface
 
 KernelAbstractions.get_backend(a::JLA) where JLA <: JLArray = JLBackend()
-KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseVector} = JLBackend()
+KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO, JLSparseVector} = JLBackend()
 
 function KernelAbstractions.mkcontext(kernel::Kernel{JLBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
     return KernelAbstractions.CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 17a4d897..d50dcee0 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -9,6 +9,7 @@ using Printf
 using LinearAlgebra.BLAS
 using Base.Cartesian
 
+using Atomix
 using Adapt
 using LLVM.Interop
 
diff --git a/src/host/sparse.jl b/src/host/sparse.jl
index 5c84ee77..c3bf5d4b 100644
--- a/src/host/sparse.jl
+++ b/src/host/sparse.jl
@@ -44,29 +44,616 @@ Base.iszero(A::AbstractGPUSparseArray) = SparseArrays.nnz(A) == 0 || all(iszero,
 SparseArrays.SparseVector(x::AbstractGPUSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x)))
 SparseArrays.SparseMatrixCSC(x::AbstractGPUSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(SparseArrays.getcolptr(x)), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x)))
 
+function check_sparse_target(::Type{ST}, ::Type{Tv}, ::Type{Ti}) where {ST,Tv,Ti}
+    Tv isa Type && Ti isa Type ||
+        throw(ArgumentError("sparse target type $ST must specify value and index eltypes"))
+    return
+end
+
+@kernel function dense_sparse_vector_kernel!(iPtr, nzVal, A, indices)
+    i = @index(Global, Linear)
+    if i <= length(indices)
+        @inbounds begin
+            idx = indices[i]
+            iPtr[i] = idx
+            nzVal[i] = A[idx]
+        end
+    end
+end
+
+@kernel function dense_sparse_matrix_kernel!(rowInd, colInd, nzVal, A, indices)
+    i = @index(Global, Linear)
+    if i <= length(indices)
+        @inbounds begin
+            idx = indices[i]
+            rowInd[i] = idx[1]
+            colInd[i] = idx[2]
+            nzVal[i] = A[idx]
+        end
+    end
+end
+
+"""
+    to_sparse(ST, A)
+
+Build a sparse array of type `ST` from the dense GPU array `A`, on the device (no host
+round-trip); the inverse of [`to_dense`](@ref). `ST` must be a concrete
+`AbstractGPUSparseVector` or `AbstractGPUSparseMatrixCOO` (other matrix formats follow by
+converting the resulting COO with `csr_type`/`csc_type`). The target format is required
+because the result layout would otherwise be ambiguous; `to_dense` needs no such argument.
+"""
+function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}}
+    check_sparse_target(ST, Tv, Ti)
+    indices = findall(!iszero, A)
+    nstored = length(indices)
+    iPtr = similar(A, Ti, nstored)
+    nzVal = similar(A, Tv, nstored)
+    if nstored > 0
+        dense_sparse_vector_kernel!(get_backend(A))(iPtr, nzVal, A, indices;
+                                                    ndrange=nstored)
+    end
+    unsafe_free!(indices)
+    return ST(iPtr, nzVal, length(A))
+end
+
+# Densifying a dense array naturally produces coordinate (COO) format, so this builds a
+# COO directly. Other formats are obtained by converting the COO with `csr_type`/
+# `csc_type` (which back-ends already provide as constructors), avoiding any need to map
+# a target type to its COO sibling.
+function to_sparse(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}}
+    check_sparse_target(ST, Tv, Ti)
+    indices = findall(!iszero, A)
+    nstored = length(indices)
+    rowInd = similar(A, Ti, nstored)
+    colInd = similar(A, Ti, nstored)
+    nzVal = similar(A, Tv, nstored)
+    if nstored > 0
+        dense_sparse_matrix_kernel!(get_backend(A))(rowInd, colInd, nzVal, A, indices;
+                                                    ndrange=nstored)
+    end
+    unsafe_free!(indices)
+    return ST(rowInd, colInd, nzVal, size(A), nstored)
+end
+
+@kernel function densify_vector_kernel!(out, iPtr, nzVal)
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds out[iPtr[k]] = nzVal[k]
+    end
+end
+
+@kernel function densify_matrix_kernel!(out, rowInd, colInd, nzVal)
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds out[rowInd[k], colInd[k]] = nzVal[k]
+    end
+end
+
+"""
+    to_dense(A)
+
+Materialize the sparse GPU array `A` into a dense array of the same back-end, on the
+device (no host round-trip); the inverse of [`to_sparse`](@ref). Stored coordinates are
+assumed unique (true for CSC/CSR and for COO produced by conversion); duplicate COO
+coordinates resolve to the last value written.
+"""
+function to_dense(x::AbstractGPUSparseVector{Tv}) where {Tv}
+    out = fill!(similar(nonzeros(x), Tv, length(x)), zero(Tv))
+    if nnz(x) > 0
+        densify_vector_kernel!(get_backend(out))(out, SparseArrays.nonzeroinds(x), nonzeros(x);
+                                                 ndrange=nnz(x))
+    end
+    return out
+end
+
+function to_dense(A::AbstractGPUSparseMatrix{Tv}) where {Tv}
+    coo = coo_type(A)(A)
+    out = fill!(similar(nonzeros(A), Tv, size(A)), zero(Tv))
+    if nnz(coo) > 0
+        densify_matrix_kernel!(get_backend(out))(out, coo.rowInd, coo.colInd, nonzeros(coo);
+                                                 ndrange=nnz(coo))
+    end
+    return out
+end
+
 # similar
 Base.similar(Vec::V) where {V<:AbstractGPUSparseVector} = V(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec)), length(Vec))
 Base.similar(Mat::M) where {M<:AbstractGPUSparseMatrixCSC} = M(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat)), size(Mat))
 
-Base.similar(Vec::V, T::Type) where {Tv, Ti, V<:AbstractGPUSparseVector{Tv, Ti}} = sparse_array_type(V){T, Ti}(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec), T), length(Vec))
-Base.similar(Mat::M, T::Type) where {M<:AbstractGPUSparseMatrixCSC} = sparse_array_type(M)(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat), T), size(Mat))
-
-dense_array_type(a::AbstractArray)     = dense_array_type(typeof(a))
-sparse_array_type(a::AbstractArray)    = sparse_array_type(typeof(a))
-dense_vector_type(a::AbstractArray)    = dense_vector_type(typeof(a))
-
-dense_array_type(::Type{<:SparseVector}) = SparseVector
-sparse_array_type(::Type{<:SparseVector}) = SparseVector
-dense_vector_type(::Type{<:AbstractSparseArray}) = Vector
-dense_vector_type(::Type{<:AbstractArray})       = Vector
-dense_array_type(::Type{<:SparseMatrixCSC}) = SparseMatrixCSC
-sparse_array_type(::Type{<:SparseMatrixCSC})    = SparseMatrixCSC
+# `similar(A, ::Type)` (structure-preserving, new element type) is a back-end method, just
+# like the dense `similar`. The generic algorithms below always build their outputs with
+# `similar` (dispatched on a representative value), so there is no need for backends to
+# provide `*_array_type` mapping functions to reconstruct a sparse type from a type.
 
-coo_type(sa) = coo_type(typeof(sa))
+# Stored (minor-axis) index buffer, accessed generically through the conventional field;
+# the analogue of `SparseArrays.storedinds`. Used by the broadcast/construction code.
+_stored_inds(A::AbstractGPUSparseMatrixCSR) = A.colVal
+_stored_inds(A::AbstractGPUSparseMatrixCSC) = A.rowVal
+_stored_inds(A::AbstractGPUSparseVector)    = A.iPtr
 
 function _spadjoint end
 function _sptranspose end
 
+const SparseMatmulEltypes = Union{Float16,Float32,ComplexF16,ComplexF32}
+
+sparse_matmul_accumulator(::Type{T}) where {T} = T
+sparse_matmul_accumulator(::Type{Float16}) = Float32
+sparse_matmul_accumulator(::Type{ComplexF16}) = ComplexF32
+
+const GPUSparseMatrixOrWrapper = Union{
+    AbstractGPUSparseMatrix,
+    Transpose{<:Any,<:AbstractGPUSparseMatrix},
+    Adjoint{<:Any,<:AbstractGPUSparseMatrix},
+}
+
+function sparse_op(t::AbstractChar)
+    t in ('N', 'n', 'S', 's', 'H', 'h') && return false, false
+    t in ('T', 't') && return true, false
+    t in ('C', 'c') && return true, true
+    throw(ArgumentError("unsupported sparse matrix operation '$t'"))
+end
+
+sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A)
+
+# Format-conversion hooks: a back-end maps any of its sparse-matrix types to the *type* of
+# its sibling format (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code then converts with
+# `coo_type(A)(A)`. This is a type-level hook rather than plain `convert(Dest, A)` because the
+# format is the wrapper's identity (distinct structs/fields), not a type parameter — so unlike
+# an eltype change there is no generic wrapper→sibling-wrapper operation, and only the back-end
+# knows its sibling types. The cross-format `convert` algorithm below is the engine those
+# constructors route through; the identity case (`coo_type(coo)(coo)`) is the back-end's
+# identity constructor.
+coo_type(a) = coo_type(typeof(a))
+csr_type(a) = csr_type(typeof(a))
+csc_type(a) = csc_type(typeof(a))
+
+## Generic format conversions (CSC↔CSR↔COO), on-device.
+#
+# Conversion is owned by `convert` (dispatched on GPUArrays' own abstract types, so not type
+# piracy); construction is owned by the back-end through its conventional field constructor,
+# which `convert` calls — the same division `SparseArrays` uses (`convert`/constructors for
+# conversion, `SparseMatrixCSC(m, n, colptr, rowval, nzval)` for construction). Conversions
+# preserve the source's value/index eltypes; every buffer is allocated from a source array
+# via `similar`, so the storage mode and back-end are inherited. `sortperm` on a back-end
+# vector is required (the COO→CSR/CSC path; already implied by `findnz`).
+
+@inline function sparse_lower_bound(xs, lo::Ti, hi::Ti, key::Ti) where {Ti}
+    l = lo
+    r = hi
+    while l < r
+        m = (l + r) >>> 1
+        @inbounds v = xs[m]
+        if v < key
+            l = m + one(Ti)
+        else
+            r = m
+        end
+    end
+    return l
+end
+
+# Expand a compressed pointer array (CSC `colPtr` / CSR `rowPtr`, length `dim+1`) into one
+# explicit lead index per stored entry: thread `lead` writes its index into the slots
+# `ptr[lead]:ptr[lead+1]-1` of `out`.
+@kernel function expand_ptr_kernel!(out, ptr)
+    lead = @index(Global, Linear)
+    if lead <= length(ptr) - 1
+        T = eltype(out)
+        @inbounds begin
+            k = ptr[lead]
+            stop = ptr[lead + 1]
+            v = T(lead)
+            while k < stop
+                out[k] = v
+                k += oneunit(k)
+            end
+        end
+    end
+end
+
+# Build a lead-major linear sort key per stored COO entry (lead-major: sorting by the key
+# orders entries by `(lead, sub)`); `subdim` is the size of the sub axis.
+@kernel function coo_sort_keys_kernel!(keys, leadInd, subInd, subdim)
+    i = @index(Global, Linear)
+    if i <= length(keys)
+        @inbounds keys[i] = (Int64(leadInd[i]) - Int64(1)) * Int64(subdim) + Int64(subInd[i])
+    end
+end
+
+# Gather the three COO arrays through a permutation (e.g. from `sortperm`).
+@kernel function coo_gather_kernel!(leadOut, subOut, nzOut, leadIn, subIn, nzIn, perm)
+    i = @index(Global, Linear)
+    if i <= length(perm)
+        @inbounds begin
+            src = perm[i]
+            leadOut[i] = leadIn[src]
+            subOut[i] = subIn[src]
+            nzOut[i] = nzIn[src]
+        end
+    end
+end
+
+# Build a compressed pointer array from sorted lead indices: `ptr[i]` is the first 1-based
+# position of a stored entry whose lead index is ≥ `i`, and `ptr[end] = nnz+1`.
+@kernel function compressed_ptr_from_sorted_kernel!(ptr, sortedLead, nnz)
+    i = @index(Global, Linear)
+    Ti = eltype(ptr)
+    if i <= length(ptr)
+        @inbounds ptr[i] = if i == length(ptr)
+            Ti(nnz) + one(Ti)
+        else
+            sparse_lower_bound(sortedLead, one(Ti), Ti(nnz) + one(Ti), Ti(i))
+        end
+    end
+end
+
+# Order a COO matrix's stored entries lead-major, returning fresh (lead, sub, nz) arrays at
+# the source eltypes. Duplicate coordinates are preserved (no summing).
+function _sorted_coo(A::AbstractGPUSparseMatrixCOO{Tv,Ti}, ::Val{lead}) where {Tv,Ti,lead}
+    lead === :row || lead === :col ||
+        throw(ArgumentError("COO sort lead must be :row or :col"))
+    total = Int(nnz(A))
+    leadIn = lead === :row ? A.rowInd : A.colInd
+    subIn  = lead === :row ? A.colInd : A.rowInd
+    subdim = lead === :row ? size(A, 2) : size(A, 1)
+
+    leadOut = similar(leadIn, Ti, total)
+    subOut  = similar(subIn, Ti, total)
+    nzOut   = similar(nonzeros(A), Tv, total)
+    total == 0 && return leadOut, subOut, nzOut
+
+    backend = get_backend(leadOut)
+    keys = similar(leadIn, Int64, total)
+    coo_sort_keys_kernel!(backend)(keys, leadIn, subIn, subdim; ndrange=total)
+    perm = sortperm(keys)
+    coo_gather_kernel!(backend)(leadOut, subOut, nzOut, leadIn, subIn, nonzeros(A), perm;
+                                ndrange=total)
+    unsafe_free!(keys)
+    unsafe_free!(perm)
+    return leadOut, subOut, nzOut
+end
+
+# compressed pointer array of length `dim+1` from (possibly empty) sorted lead indices,
+# inheriting the lead array's index eltype and storage.
+function _compressed_ptr(sortedLead, dim::Integer, n::Integer)
+    ptr = similar(sortedLead, dim + 1)
+    if n > 0
+        compressed_ptr_from_sorted_kernel!(get_backend(ptr))(ptr, sortedLead, n;
+                                                             ndrange=dim + 1)
+    else
+        fill!(ptr, one(eltype(ptr)))
+    end
+    return ptr
+end
+
+# Cross-format conversions are ordinary `convert` methods, dispatched on GPUArrays' own
+# abstract types (so this is not type piracy and avoids the ambiguity a generic constructor
+# would have with a back-end's host-array constructor). The result is built with the
+# back-end's conventional field constructor `ST(components..., dims[, nnz])` — the write-side
+# analogue of the `colPtr`/`rowVal`/`nzVal` read contract, and how `SparseArrays` itself
+# builds (`SparseMatrixCSC(m, n, colptr, rowval, nzval)`). A back-end with unconventional
+# construction overrides the relevant `convert` method.
+
+# CSR → COO: expand `rowPtr` into explicit row indices; columns and values are shared.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCOO}
+    n = Int(nnz(A))
+    rowInd = similar(A.colVal, n)
+    n > 0 && expand_ptr_kernel!(get_backend(rowInd))(rowInd, A.rowPtr; ndrange=size(A, 1))
+    return ST(rowInd, A.colVal, nonzeros(A), size(A), n)
+end
+
+# CSC → COO: expand `colPtr` into explicit column indices; rows and values are shared.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCOO}
+    n = Int(nnz(A))
+    colInd = similar(A.rowVal, n)
+    n > 0 && expand_ptr_kernel!(get_backend(colInd))(colInd, A.colPtr; ndrange=size(A, 2))
+    return ST(A.rowVal, colInd, nonzeros(A), size(A), n)
+end
+
+# COO → CSR: sort row-major, then compress the row indices into a `rowPtr`.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSR}
+    rowInd, colVal, nzVal = _sorted_coo(A, Val(:row))
+    rowPtr = _compressed_ptr(rowInd, size(A, 1), Int(nnz(A)))
+    unsafe_free!(rowInd)
+    return ST(rowPtr, colVal, nzVal, size(A))
+end
+
+# COO → CSC: sort column-major, then compress the column indices into a `colPtr`.
+function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSC}
+    colInd, rowVal, nzVal = _sorted_coo(A, Val(:col))
+    colPtr = _compressed_ptr(colInd, size(A, 2), Int(nnz(A)))
+    unsafe_free!(colInd)
+    return ST(colPtr, rowVal, nzVal, size(A))
+end
+
+# CSC ↔ CSR route through COO (the back-end's hook yields its concrete COO type).
+Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCSR} =
+    convert(ST, coo_type(A)(A))
+Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCSC} =
+    convert(ST, coo_type(A)(A))
+
+@inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x)
+    @inbounds Atomix.@atomic A[Int(I)] += x
+    return
+end
+
+@inline function sparse_atomic_add!(A::AbstractArray{<:Complex}, parts, I::Integer, x)
+    j = 2 * Int(I)
+    @inbounds begin
+        Atomix.@atomic parts[j - 1] += real(x)
+        Atomix.@atomic parts[j] += imag(x)
+    end
+    return
+end
+
+sparse_atomic_parts(A::AbstractArray{<:Real}) = A
+sparse_atomic_parts(A::AbstractArray{<:Complex{T}}) where {T} = reinterpret(T, A)
+
+@kernel function sparse_scale_kernel!(Y, beta)
+    i = @index(Global, Linear)
+    if i <= length(Y)
+        T = eltype(Y)
+        @inbounds Y[i] = iszero(beta) ? zero(T) : T(beta) * Y[i]
+    end
+end
+
+@kernel function sparse_combine_kernel!(Y, acc, alpha, beta, ::Val{Tacc}) where {Tacc}
+    i = @index(Global, Linear)
+    if i <= length(Y)
+        Ty = eltype(Y)
+        @inbounds Y[i] = iszero(beta) ?
+            Ty(Tacc(alpha) * Tacc(acc[i])) :
+            Ty(Tacc(alpha) * Tacc(acc[i]) + Tacc(beta) * Tacc(Y[i]))
+    end
+end
+
+function sparse_scale!(Y, beta)
+    isempty(Y) && return Y
+    sparse_scale_kernel!(get_backend(Y))(Y, beta; ndrange=length(Y))
+    return Y
+end
+
+function sparse_combine!(Y::AnyGPUArray{Ty}, acc, alpha, beta) where {Ty}
+    isempty(Y) && return Y
+    Tacc = sparse_matmul_accumulator(Ty)
+    sparse_combine_kernel!(get_backend(Y))(Y, acc, Tacc(alpha), Tacc(beta), Val(Tacc);
+                                           ndrange=length(Y))
+    return Y
+end
+
+@kernel function spmv_coo_kernel!(y, yparts, alpha, rowInd, colInd, nzVal, x,
+                                  trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    k = @index(Global, Linear)
+    if k <= length(nzVal)
+        @inbounds begin
+            out = trans ? colInd[k] : rowInd[k]
+            src = trans ? rowInd[k] : colInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            sparse_atomic_add!(y, yparts, out, Tacc(alpha) * v * Tacc(x[src]))
+        end
+    end
+end
+
+@kernel function spmm_coo_kernel!(C, Cparts, alpha, rowInd, colInd, nzVal, B,
+                                  trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    k, j = @index(Global, NTuple)
+    if k <= length(nzVal) && j <= size(B, 2)
+        @inbounds begin
+            outrow = trans ? colInd[k] : rowInd[k]
+            srcrow = trans ? rowInd[k] : colInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            lin = Int(outrow) + (Int(j) - 1) * size(C, 1)
+            sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * v * Tacc(B[srcrow, j]))
+        end
+    end
+end
+
+@kernel function dense_spmm_coo_kernel!(C, Cparts, alpha, A, rowInd, colInd, nzVal,
+                                        trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc}
+    i, k = @index(Global, NTuple)
+    if i <= size(C, 1) && k <= length(nzVal)
+        @inbounds begin
+            outcol = trans ? rowInd[k] : colInd[k]
+            srccol = trans ? colInd[k] : rowInd[k]
+            v = Tacc(nzVal[k])
+            v = conjval ? conj(v) : v
+            lin = Int(i) + (Int(outcol) - 1) * size(C, 1)
+            sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * Tacc(A[i, srccol]) * v)
+        end
+    end
+end
+
+function spmv_coo!(y::AnyGPUVector{Ty}, A::AbstractGPUSparseMatrixCOO, x,
+                   alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Ty}
+    Tacc = sparse_matmul_accumulator(Ty)
+    if Tacc === Ty
+        sparse_scale!(y, beta)
+        target = y
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(y, Tacc, size(y))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(A) > 0
+        parts = sparse_atomic_parts(target)
+        spmv_coo_kernel!(get_backend(y))(target, parts, scatter_alpha, A.rowInd, A.colInd,
+                                         nonzeros(A), x, trans, conjval, Val(Tacc);
+                                         ndrange=nnz(A))
+    end
+
+    if !(Tacc === Ty)
+        sparse_combine!(y, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return y
+end
+
+function spmm_coo!(C::AnyGPUMatrix{Tc}, A::AbstractGPUSparseMatrixCOO, B,
+                   alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc}
+    Tacc = sparse_matmul_accumulator(Tc)
+    if Tacc === Tc
+        sparse_scale!(C, beta)
+        target = C
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(C, Tacc, size(C))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(A) > 0 && size(C, 2) > 0
+        parts = sparse_atomic_parts(target)
+        spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A.rowInd, A.colInd,
+                                         nonzeros(A), B, trans, conjval, Val(Tacc);
+                                         ndrange=(nnz(A), size(C, 2)))
+    end
+
+    if !(Tacc === Tc)
+        sparse_combine!(C, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return C
+end
+
+function dense_spmm_coo!(C::AnyGPUMatrix{Tc}, A, B::AbstractGPUSparseMatrixCOO,
+                         alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc}
+    Tacc = sparse_matmul_accumulator(Tc)
+    if Tacc === Tc
+        sparse_scale!(C, beta)
+        target = C
+        scatter_alpha = Tacc(alpha)
+    else
+        target = similar(C, Tacc, size(C))
+        fill!(target, zero(Tacc))
+        scatter_alpha = one(Tacc)
+    end
+
+    if nnz(B) > 0 && size(C, 1) > 0
+        parts = sparse_atomic_parts(target)
+        dense_spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A, B.rowInd,
+                                               B.colInd, nonzeros(B), trans, conjval,
+                                               Val(Tacc);
+                                               ndrange=(size(C, 1), nnz(B)))
+    end
+
+    if !(Tacc === Tc)
+        sparse_combine!(C, target, alpha, beta)
+        unsafe_free!(target)
+    end
+    return C
+end
+
+function Base.:(*)(A::GPUSparseMatrixOrWrapper, x::AnyGPUVector)
+    size(A, 2) == length(x) || throw(DimensionMismatch())
+    y = similar(x, promote_type(eltype(A), eltype(x)), size(A, 1))
+    return LinearAlgebra.mul!(y, A, x)
+end
+
+function Base.:(*)(A::GPUSparseMatrixOrWrapper, B::AnyGPUMatrix)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    C = similar(B, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2)))
+    return LinearAlgebra.mul!(C, A, B)
+end
+
+function Base.:(*)(A::AnyGPUMatrix, B::GPUSparseMatrixOrWrapper)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    C = similar(A, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2)))
+    return LinearAlgebra.mul!(C, A, B)
+end
+
+LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar,
+                                 A::AbstractGPUSparseMatrix{Ta}, x::AnyGPUVector{Tx},
+                                 muladd::MulAddMul) where {
+                                     Ty<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tx<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matvecmul!(y, tA, A, x, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar,
+                                          A::AbstractGPUSparseMatrix{Ta},
+                                          x::AnyGPUVector{Tx},
+                                          alpha::Number, beta::Number) where {
+                                              Ty<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tx<:SparseMatmulEltypes,
+                                          }
+    trans, conjval = sparse_op(tA)
+    Asize = sparse_op_size(A, trans)
+    Asize[2] == length(x) ||
+        throw(DimensionMismatch("A has dimensions $Asize but x has length $(length(x))"))
+    Asize[1] == length(y) ||
+        throw(DimensionMismatch("A has dimensions $Asize but y has length $(length(y))"))
+    y === x && throw(ArgumentError("output vector must not be aliased with input vector"))
+    isempty(y) && return y
+
+    return spmv_coo!(y, coo_type(A)(A), x, alpha, beta, trans, conjval)
+end
+
+LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                 A::AbstractGPUSparseMatrix{Ta}, B::AnyGPUMatrix{Tb},
+                                 muladd::MulAddMul) where {
+                                     Tc<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tb<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                          A::AbstractGPUSparseMatrix{Ta},
+                                          B::AnyGPUMatrix{Tb},
+                                          alpha::Number, beta::Number) where {
+                                              Tc<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tb<:SparseMatmulEltypes,
+                                          }
+    trans, conjval = sparse_op(tA)
+    Bop = LinearAlgebra.wrap(B, tB)
+    Asize = sparse_op_size(A, trans)
+    Asize[2] == size(Bop, 1) ||
+        throw(DimensionMismatch("A has dimensions $Asize but B has dimensions $(size(Bop))"))
+    size(C) == (Asize[1], size(Bop, 2)) ||
+        throw(DimensionMismatch("C has dimensions $(size(C)), should have $((Asize[1], size(Bop, 2)))"))
+    C === B && throw(ArgumentError("output matrix must not be aliased with input matrix"))
+    isempty(C) && return C
+
+    return spmm_coo!(C, coo_type(A)(A), Bop, alpha, beta, trans, conjval)
+end
+
+LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                 A::AnyGPUMatrix{Ta}, B::AbstractGPUSparseMatrix{Tb},
+                                 muladd::MulAddMul) where {
+                                     Tc<:SparseMatmulEltypes,
+                                     Ta<:SparseMatmulEltypes,
+                                     Tb<:SparseMatmulEltypes,
+                                 } =
+    LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta)
+
+function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB,
+                                          A::AnyGPUMatrix{Ta},
+                                          B::AbstractGPUSparseMatrix{Tb},
+                                          alpha::Number, beta::Number) where {
+                                              Tc<:SparseMatmulEltypes,
+                                              Ta<:SparseMatmulEltypes,
+                                              Tb<:SparseMatmulEltypes,
+                                          }
+    Aop = LinearAlgebra.wrap(A, tA)
+    trans, conjval = sparse_op(tB)
+    Bsize = sparse_op_size(B, trans)
+    size(Aop, 2) == Bsize[1] ||
+        throw(DimensionMismatch("A has dimensions $(size(Aop)) but B has dimensions $Bsize"))
+    size(C) == (size(Aop, 1), Bsize[2]) ||
+        throw(DimensionMismatch("C has dimensions $(size(C)), should have $((size(Aop, 1), Bsize[2]))"))
+    C === A && throw(ArgumentError("output matrix must not be aliased with input matrix"))
+    isempty(C) && return C
+
+    return dense_spmm_coo!(C, Aop, coo_type(B)(B), alpha, beta, trans, conjval)
+end
+
 function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2)
     if p == Inf
         return maximum(sum(abs, A; dims=2))
@@ -92,7 +679,7 @@ function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T
 end
 
 function SparseArrays.findnz(S::MT) where {MT <: AbstractGPUSparseMatrix}
-    S2 = coo_type(MT)(S)
+    S2 = coo_type(S)(S)
     I = S2.rowInd
     J = S2.colInd
     V = S2.nzVal
@@ -134,16 +721,16 @@ for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCS
         LinearAlgebra.triu(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
             ST( triu(coo_type(A)(A), k) )
         LinearAlgebra.triu(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(_sptranspose(parent(A))), k) )
+            ST( triu(coo_type(parent(A))(_sptranspose(parent(A))), k) )
         LinearAlgebra.triu(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( triu(coo_type(A)(_spadjoint(parent(A))), k) )
+            ST( triu(coo_type(parent(A))(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.tril(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
             ST( tril(coo_type(A)(A), k) )
         LinearAlgebra.tril(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(_sptranspose(parent(A))), k) )
+            ST( tril(coo_type(parent(A))(_sptranspose(parent(A))), k) )
         LinearAlgebra.tril(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} =
-            ST( tril(coo_type(A)(_spadjoint(parent(A))), k) )
+            ST( tril(coo_type(parent(A))(_spadjoint(parent(A))), k) )
 
         LinearAlgebra.triu(A::Union{ST, Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} =
             ST( triu(coo_type(A)(A), 0) )
@@ -158,18 +745,18 @@ for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCS
             ST( kron(A, coo_type(B)(B)) )
 
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(B)(B)) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(B)(B)) )
         LinearAlgebra.kron(A::ST, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
             ST( kron(coo_type(A)(A), coo_type(parent(B))(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) )
         LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_sptranspose(parent(A))), B) )
+            ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), B) )
         LinearAlgebra.kron(A::Diagonal, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(A, coo_type(B)(_sptranspose(parent(B)))) )
+            ST( kron(A, coo_type(parent(B))(_sptranspose(parent(B)))) )
 
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} =
-            ST( kron(coo_type(A)(_spadjoint(parent(A))), coo_type(B)(B)) )
+            ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), coo_type(B)(B)) )
         LinearAlgebra.kron(A::ST, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
             ST( kron(coo_type(A)(A), coo_type(parent(B))(_spadjoint(parent(B)))) )
         LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} =
@@ -814,17 +1401,10 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt
     elseif length(sparse_args) == 1 && sparse_typ <: Union{AbstractGPUSparseMatrixCSR, AbstractGPUSparseMatrixCSC}
         # we only have a single sparse input, so we can reuse its structure for the output.
         # this avoids a kernel launch and costly synchronization.
-        if sparse_typ <: AbstractGPUSparseMatrixCSR
-            offsets = rowPtr = sparse_arg.rowPtr
-            colVal  = similar(sparse_arg.colVal)
-            nzVal   = similar(sparse_arg.nzVal, Tv)
-            output  = sparse_array_type(sparse_typ)(rowPtr, colVal, nzVal, size(bc))
-        elseif sparse_typ <: AbstractGPUSparseMatrixCSC
-            offsets = colPtr = sparse_arg.colPtr
-            rowVal  = similar(sparse_arg.rowVal)
-            nzVal   = similar(sparse_arg.nzVal, Tv)
-            output  = sparse_array_type(sparse_typ)(colPtr, rowVal, nzVal, size(bc))
-        end
+        # structure is preserved, so a structure-preserving `similar` gives the output
+        # container directly; the kernel rewrites the (identical) pointer array from `offsets`.
+        output  = similar(sparse_arg, Tv)
+        offsets = sparse_typ <: AbstractGPUSparseMatrixCSR ? sparse_arg.rowPtr : sparse_arg.colPtr
     else
         # determine the number of non-zero elements per row so that we can create an
         # appropriately-structured output container
@@ -870,17 +1450,22 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt
             total_nnz = mapreduce(x->first(x) != typemax(first(x)), +, offsets)
         end
         output = if sparse_typ <: Union{AbstractGPUSparseMatrixCSR,AbstractGPUSparseMatrixCSC}
-            ixVal = similar(offsets, Ti, total_nnz)
-            nzVal = similar(offsets, Tv, total_nnz)
-            output_sparse_typ = sparse_array_type(sparse_typ) 
-            output_sparse_typ(offsets, ixVal, nzVal, size(bc))
+            # build an empty sparse output of the same kind, then size its index/value
+            # storage to the computed nnz. The kernel fills the pointer array (from
+            # `offsets`), the stored indices, and the values.
+            out = similar(sparse_arg, Tv, size(bc))
+            resize!(_stored_inds(out), total_nnz)
+            resize!(nonzeros(out), total_nnz)
+            out.nnz = total_nnz
+            out
         elseif sparse_typ <: AbstractGPUSparseVector && !fpreszeros
-            val_array = bc.args[first(sparse_args)].nzVal
-            similar(val_array, Tv, size(bc))
+            similar(nonzeros(sparse_arg), Tv, size(bc))
         elseif sparse_typ <: AbstractGPUSparseVector && fpreszeros
-            iPtr   = similar(offsets, Ti, total_nnz)
-            nzVal  = similar(offsets, Tv, total_nnz)
-            sparse_array_type(sparse_arg){Tv, Ti}(iPtr, nzVal, rows)
+            out = similar(sparse_arg, Tv, (rows,))
+            resize!(_stored_inds(out), total_nnz)
+            resize!(nonzeros(out), total_nnz)
+            out.nnz = total_nnz
+            out
         end
         if sparse_typ <: AbstractGPUSparseVector && !fpreszeros
             nonsparse_args = map(bc.args) do arg
@@ -983,9 +1568,6 @@ end
 end
 ## COV_EXCL_STOP
 
-function csc_type end
-function csr_type end
-
 # TODO: implement mapreducedim!
 function Base.mapreduce(f, op, A::AbstractGPUSparseMatrix; dims=:, init=nothing)
     # figure out the destination container type by looking at the initializer element,
diff --git a/test/runtests.jl b/test/runtests.jl
index a91715b9..f60c43e6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,7 +8,7 @@ const init_worker_code = quote
 
     include("testsuite.jl")
 
-    TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR)
+    TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO)
     TestSuite.sparse_types(::Type{<:Array})   = (SparseVector, SparseMatrixCSC)
 
     # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 3e0d92fe..f54ffbf8 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -1,27 +1,296 @@
 @testsuite "sparse" (AT, eltypes)->begin
     sparse_ATs = sparse_types(AT)
+    matrix_ATs = filter(T -> T <: AbstractSparseMatrix, sparse_ATs)
     for sparse_AT in sparse_ATs
         if sparse_AT <: AbstractSparseVector
-            vector(sparse_AT, eltypes)
-            vector_construction(sparse_AT, eltypes)
-            broadcasting_vector(sparse_AT, eltypes)
+            vector(sparse_AT, AT, eltypes)
+            vector_construction(sparse_AT, AT, eltypes)
+            broadcasting_vector(sparse_AT, AT, eltypes)
             iszero_vector(sparse_AT, eltypes)
         elseif sparse_AT <: AbstractSparseMatrix
-            matrix(sparse_AT, eltypes)
-            matrix_construction(sparse_AT, eltypes)
-            broadcasting_matrix(sparse_AT, eltypes)
-            mapreduce_matrix(sparse_AT, eltypes)
-            linalg(sparse_AT, eltypes)
-            iszero_matrix(sparse_AT, eltypes)
+            # `matmul` runs for every format (incl. COO). The matrix battery below is
+            # CSC/CSR-only: COO lacks the broadcast/reductions/`opnorm`/`issymmetric` it uses.
+            matmul(sparse_AT, AT, eltypes)
+            # Float32 accumulation is a GPU-kernel contract; stdlib `Array` accumulates
+            # narrowly, so assert it only for GPU formats.
+            sparse_AT <: GPUArrays.AbstractGPUSparseMatrix &&
+                matmul_accumulation(sparse_AT, AT, eltypes)
+            if sparse_AT <: Union{GPUArrays.AbstractGPUSparseMatrixCSC,
+                                  GPUArrays.AbstractGPUSparseMatrixCSR}
+                matrix(sparse_AT, eltypes)
+                matrix_construction(sparse_AT, AT, eltypes)
+                broadcasting_matrix(sparse_AT, AT, eltypes)
+                mapreduce_matrix(sparse_AT, eltypes)
+                linalg(sparse_AT, eltypes)
+                iszero_matrix(sparse_AT, eltypes)
+            end
        end
     end
+    undef_construction(sparse_ATs, eltypes)
+    # cross-format conversions need ≥2 registered matrix formats to be meaningful.
+    length(matrix_ATs) > 1 && conversions(matrix_ATs, AT, eltypes)
+    # the repeated-index COO matmul test runs against whichever COO type the back-end
+    # registers in `sparse_types` (no separate `sparse_coo_type` hook).
+    i = findfirst(T -> T <: GPUArrays.AbstractGPUSparseMatrixCOO, sparse_ATs)
+    i === nothing || coo_matmul(sparse_ATs[i], AT, eltypes)
 end
 
 using SparseArrays
 using SparseArrays: nonzeroinds, nonzeros, rowvals
 
-function vector(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function coo_values(::Type{T}) where {T<:Real}
+    return T[1, 2, 3, 4]
+end
+
+function coo_values(::Type{T}) where {T<:Complex}
+    return T[1 + im, 2 - 3im, 3 + 2im, 4 - im]
+end
+
+function coo_vector(::Type{T}) where {T<:Real}
+    return T[2, 3, 5]
+end
+
+function coo_vector(::Type{T}) where {T<:Complex}
+    return T[2 - im, 3 + 4im, 5 - 2im]
+end
+
+function coo_rhs(::Type{T}) where {T<:Real}
+    return T[
+        1 2;
+        3 4;
+        5 6
+    ]
+end
+
+function coo_rhs(::Type{T}) where {T<:Complex}
+    return T[
+        1 + im 2 - im;
+        3 + 2im 4 - 3im;
+        5 - im 6 + im
+    ]
+end
+
+function coo_lhs(::Type{T}) where {T<:Real}
+    return T[
+        1 2 3;
+        4 5 6
+    ]
+end
+
+function coo_lhs(::Type{T}) where {T<:Complex}
+    return T[
+        1 - im 2 + im 3 - 2im;
+        4 + 3im 5 - im 6 + im
+    ]
+end
+
+# A non-zero-preserving broadcast densifies its result: GPU back-ends return the dense
+# array type `dense_AT`, whereas CPU `SparseArrays` returns its sparse type with every
+# entry explicitly stored. Accept either so the suite is meaningful for both.
+is_densified(x, dense_AT, ::Type{ET}) where {ET} =
+    dense_AT <: GPUArrays.AnyGPUArray ? x isa dense_AT{ET} : (issparse(x) && eltype(x) === ET)
+
+function coo_matmul(AT, dense_AT, eltypes)
+    for ET in (Float16, Float32, ComplexF16, ComplexF32)
+        ET in eltypes || continue
+        @testset "COO repeated-index matmul($ET)" begin
+            I = Int32[1, 1, 2, 3]
+            J = Int32[1, 1, 2, 1]
+            V = coo_values(ET)
+            A = sparse(Int.(I), Int.(J), V, 3, 3)
+            dA = AT(dense_AT(I), dense_AT(J), dense_AT(V), (3, 3), length(V))
+
+            x = coo_vector(ET)
+            dx = dense_AT(x)
+            @test Array(dA * dx) ≈ Array(A * x)
+            @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x)
+            @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x)
+
+            old_y = dense_AT(fill(ET(1), 3))
+            mul!(old_y, dA, dx, ET(2), ET(3))
+            @test Array(old_y) ≈ Array(ET(2) .* (A * x) .+ ET(3))
+
+            B = coo_rhs(ET)
+            dB = dense_AT(B)
+            @test Array(dA * dB) ≈ Array(A * B)
+            @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B)
+            @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B)
+
+            L = coo_lhs(ET)
+            dL = dense_AT(L)
+            @test Array(dL * dA) ≈ Array(L * A)
+            @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A))
+            @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A))
+        end
+    end
+end
+
+# Narrow complex test data to the matmul eltype: complex eltypes keep both parts (so
+# `transpose` and `adjoint` differ); real eltypes take the real part, since a complex value
+# with a nonzero imaginary part can't be converted to a real type directly. Projecting onto
+# the real part reproduces the proven real test data exactly (e.g. `1 + 2im` → `1`).
+matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Complex} = ET.(x)
+matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Real}    = ET.(real.(x))
+matmul_cast(x::Number, ::Type{ET}) where {ET<:Complex} = ET(x)
+matmul_cast(x::Number, ::Type{ET}) where {ET<:Real}    = ET(real(x))
+
+# Per matrix format, general (duplicate-free) matmul: SpMV, SpMM, and dense·sparse, each
+# with transpose/adjoint and the 5-arg `mul!`. The repeated-coordinate COO case is covered
+# separately by `coo_matmul`.
+function matmul(AT, dense_AT, eltypes)
+    for ET in (Float16, Float32, ComplexF16, ComplexF32)
+        ET in eltypes || continue
+        @testset "$(nameof(AT)) matmul($ET)" begin
+            V = matmul_cast(ComplexF32[1 + 2im, 2 - im, 3 + im, 4 - 3im, 5 + 2im], ET)
+            A = sparse([1, 1, 2, 4, 4], [1, 3, 2, 1, 4], V, 4, 4)
+            dA = AT(A)
+
+            α = matmul_cast(2 + im, ET)
+            β = matmul_cast(3 - im, ET)
+
+            # SpMV, transpose/adjoint, and the 5-arg and bool `mul!`
+            x = matmul_cast(ComplexF32[2 - im, 3 + 2im, 5 - 4im, 7 + im], ET)
+            dx = dense_AT(x)
+            @test Array(dA * dx) ≈ Array(A * x)
+            @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x)
+            @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x)
+
+            y = matmul_cast(ComplexF32[1 + im, 2 - im, 3 + 2im, 4 - im], ET)
+            dy = dense_AT(copy(y))
+            mul!(dy, dA, dx, α, β)
+            @test Array(dy) ≈ α .* Array(A * x) .+ β .* y
+
+            dz = dense_AT(zeros(ET, size(A, 1)))
+            mul!(dz, dA, dx, true, false)
+            @test Array(dz) ≈ Array(A * x)
+
+            # SpMM, transpose/adjoint, and the 5-arg `mul!`
+            B = matmul_cast(ComplexF32[
+                1 + im 2 - im 3 + 2im;
+                4 - im 5 + im 6 - 2im;
+                7 + im 8 - im 9 + 3im;
+                2 + 4im 3 - 5im 4 + im
+            ], ET)
+            dB = dense_AT(B)
+            @test Array(dA * dB) ≈ Array(A * B)
+            @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B)
+            @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B)
+
+            C = matmul_cast(ComplexF32[
+                1 - im 2 + im 3 - 2im;
+                4 + im 5 - im 6 + 2im;
+                7 - im 8 + im 9 - 3im;
+                2 - 4im 3 + 5im 4 - im
+            ], ET)
+            dC = dense_AT(copy(C))
+            mul!(dC, dA, dB, α, β)
+            @test Array(dC) ≈ α .* Array(A * B) .+ β .* C
+
+            # dense·sparse, transpose/adjoint, and the 5-arg `mul!`
+            L = matmul_cast(ComplexF32[
+                1 - im 2 + im 3 - 2im 4 + im;
+                5 + 2im 6 - im 7 + im 8 - 3im
+            ], ET)
+            dL = dense_AT(L)
+            @test Array(dL * dA) ≈ Array(L * A)
+            @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A))
+            @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A))
+
+            D = matmul_cast(ComplexF32[
+                1 + im 2 - im 3 + 2im 4 - im;
+                2 + 3im 3 - im 5 + im 7 - 2im
+            ], ET)
+            dD = dense_AT(copy(D))
+            mul!(dD, dL, dA, α, β)
+            @test Array(dD) ≈ α .* Array(L * A) .+ β .* D
+        end
+    end
+end
+
+# Per matrix format, the Float16/ComplexF16 accumulation contract: the back-end must
+# accumulate in Float32/ComplexF32 and round once at the end. Every product feeding a given
+# output entry is identical, so the n-term sum is independent of accumulation order, and the
+# result must match a host reference accumulated in `Tacc` *exactly* (`==`, not `≈`); a
+# narrow Float16 accumulation would lose precision and the equality would fail.
+function matmul_accumulation(AT, dense_AT, eltypes)
+    for ET in (Float16, ComplexF16)
+        ET in eltypes || continue
+        Tacc = ET <: Complex ? ComplexF32 : Float32
+        @testset "$(nameof(AT)) matmul accumulation($ET)" begin
+            n = 128
+            av = matmul_cast(0.1 + 0.2im, ET)
+            xv = matmul_cast(0.1 - 0.3im, ET)
+
+            # SpMV: a 1×n row times a length-n vector
+            A = sparse(fill(1, n), 1:n, fill(av, n), 1, n)
+            x = fill(xv, n)
+            dA = AT(A)
+            dx = dense_AT(x)
+            @test Array(dA * dx) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(x))
+
+            # SpMM: the same row times an n×3 dense matrix
+            B = fill(xv, n, 3)
+            dB = dense_AT(B)
+            @test Array(dA * dB) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(B))
+
+            # dense·sparse: a 3×n dense matrix times an n×1 column
+            S = sparse(1:n, fill(1, n), fill(av, n), n, 1)
+            dS = AT(S)
+            L = fill(xv, 3, n)
+            dL = dense_AT(L)
+            @test Array(dL * dS) == ET.(Tacc.(L) * SparseMatrixCSC{Tacc,Int}(S))
+        end
+    end
+end
+
+# Round-trip a duplicate-free matrix (and an empty one) through every ordered pair of
+# registered matrix formats: `src == dst` exercises the identity path and `src != dst` the
+# cross-format `convert`. Conversions preserve coordinates, so the round-trip is exact.
+function conversions(matrix_ATs, dense_AT, eltypes)
+    @testset "sparse format conversions" begin
+        A = sparse([1, 4, 2, 4, 1], [3, 1, 2, 5, 1], Float32[3, 4, 5, 6, 7], 4, 5)
+        Z = spzeros(Float32, 3, 4)
+        for src in matrix_ATs, dst in matrix_ATs
+            @test SparseMatrixCSC(dst(src(A))) == A
+            @test SparseMatrixCSC(dst(src(Z))) == Z
+        end
+    end
+end
+
+# The mandated empty-of-shape primitive: `ST{Tv,Ti}(undef, dims)` builds a structurally
+# empty array (no stored entries), mirroring dense `undef` and `SparseArrays`' undef
+# constructors. Exercised per registered format; the back-end's index type is read off a
+# reference instance so the test stays back-end agnostic.
+function undef_construction(sparse_ATs, eltypes)
+    @testset "undef construction" begin
+        for ST in sparse_ATs
+            if ST <: AbstractSparseVector
+                Ti = typeof(ST(spzeros(Float32, 3))).parameters[2]
+                for Tv in (Float32, ComplexF32)
+                    Tv in eltypes || continue
+                    v = ST{Tv,Ti}(undef, 5)
+                    @test v isa ST{Tv,Ti}
+                    @test size(v) == (5,)
+                    @test nnz(v) == 0
+                    @test SparseVector(v) == spzeros(Tv, 5)
+                end
+            else
+                Ti = typeof(ST(spzeros(Float32, 3, 4))).parameters[2]
+                for Tv in (Float32, ComplexF32)
+                    Tv in eltypes || continue
+                    A = ST{Tv,Ti}(undef, (3, 4))
+                    @test A isa ST{Tv,Ti}
+                    @test size(A) == (3, 4)
+                    @test nnz(A) == 0
+                    @test SparseMatrixCSC(A) == spzeros(Tv, 3, 4)
+                    @test size(ST{Tv,Ti}(undef, 3, 4)) == (3, 4)  # varargs form
+                end
+            end
+        end
+    end
+end
+
+function vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "Sparse vector properties($ET)" begin
             m = 25
@@ -58,8 +327,7 @@ function vector(AT, eltypes)
     end
 end
 
-function vector_construction(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function vector_construction(AT, dense_AT, eltypes)
     for ET in eltypes
         m = 25
         n = 35
@@ -71,11 +339,20 @@ function vector_construction(AT, eltypes)
         @test collect(d_x) == collect(x)
         @test similar(d_x) isa AT{ET}
         @test similar(d_x, Float32) isa AT{Float32}
+
+        dense_x = dense_AT(collect(x))
+        @test SparseVector(dense_x) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            typed_d_x = AT(dense_x)
+            @test typed_d_x isa typeof(d_x)
+            @test SparseVector(typed_d_x) == x
+        end
+        @test sparse(dense_x) isa AT{ET}
+        @test SparseVector(sparse(dense_x)) == x
     end
 end
 
 function matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         @testset "Sparse matrix properties($ET)" begin
             m = 25
@@ -131,8 +408,7 @@ function matrix(AT, eltypes)
     end
 end
 
-function matrix_construction(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function matrix_construction(AT, dense_AT, eltypes)
     for ET in eltypes
         m = 25
         n = 35
@@ -148,11 +424,23 @@ function matrix_construction(AT, eltypes)
         @test similar(d_x, Float32, (n, m)) isa AT{Float32}
         @test similar(d_x, (3, 4)) isa AT{ET}
         @test size(similar(d_x, (3, 4))) == (3, 4)
+
+        dense_x = dense_AT(collect(x))
+        @test SparseMatrixCSC(dense_x) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            typed_d_x = AT(dense_x)
+            @test typed_d_x isa typeof(d_x)
+            @test SparseMatrixCSC(typed_d_x) == x
+        end
+        @test issparse(sparse(dense_x))
+        @test SparseMatrixCSC(sparse(dense_x)) == x
+        if dense_x isa GPUArrays.AnyGPUArray
+            @test SparseMatrixCSC(sparse(dense_x; fmt=:csr)) == x
+        end
     end
 end
 
-function broadcasting_vector(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function broadcasting_vector(AT, dense_AT, eltypes)
     for ET in eltypes
         @testset "SparseVector($ET)" begin
             m  = 64
@@ -172,14 +460,14 @@ function broadcasting_vector(AT, eltypes)
             # not zero-preserving
             y  = x  .+ ET(1)
             dy = dx .+ ET(1)
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             hy = Array(dy)
             @test Array(y) == hy
 
             # involving something dense
             y  = x  .+ ones(ET, m)
             dy = dx .+ dense_AT(ones(ET, m))
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             @test Array(y) == Array(dy)
 
             # sparse to sparse
@@ -210,7 +498,7 @@ function broadcasting_vector(AT, eltypes)
             dw = AT(w)
             z  = @. x  * y  * w  * dense_arr
             dz = @. dx * dy * dw * d_dense_arr
-            @test dz isa dense_AT{ET}
+            @test is_densified(dz, dense_AT, ET)
             @test Array(z) == Array(dz)
             
             y  = sprand(ET, m, p)
@@ -226,7 +514,7 @@ function broadcasting_vector(AT, eltypes)
             dx = AT(x)
             dy = dx .+ 1
             y  = x .+ 1
-            @test dy isa dense_AT{promote_type(ET, Int)}
+            @test is_densified(dy, dense_AT, promote_type(ET, Int))
             @test Array(y) == Array(dy)
             ## zero-preserving
             dy = dx .* 1
@@ -240,8 +528,7 @@ function broadcasting_vector(AT, eltypes)
     end
 end
 
-function broadcasting_matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
+function broadcasting_matrix(AT, dense_AT, eltypes)
     for ET in eltypes
        @testset "SparseMatrix($ET)" begin
             m, n = 5, 6
@@ -257,7 +544,7 @@ function broadcasting_matrix(AT, eltypes)
             # not zero-preserving
             y  = x  .+ ET(1)
             dy = dx .+ ET(1)
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             hy = Array(dy)
             dense_y = Array(y)
             @test Array(y) == Array(dy)
@@ -265,7 +552,7 @@ function broadcasting_matrix(AT, eltypes)
             # involving something dense
             y  = x  .* ones(ET, m, n)
             dy = dx .* dense_AT(ones(ET, m, n))
-            @test dy isa dense_AT{ET}
+            @test is_densified(dy, dense_AT, ET)
             @test Array(y) == Array(dy)
             
             # multiple inputs
@@ -301,7 +588,6 @@ function broadcasting_matrix(AT, eltypes)
 end
 
 function mapreduce_matrix(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         @testset "SparseMatrix($ET)" begin
             m,n = 5,6
@@ -309,32 +595,45 @@ function mapreduce_matrix(AT, eltypes)
             x = sprand(ET, m, n, p)
             dx = AT(x)
 
+            # The `abs`-reductions below produce a result of element type `RT`; for some
+            # input eltypes that is a type the back-end can't allocate (e.g. `abs` of a
+            # `Complex{<:Integer}` is `Float64`, unsupported on Metal). Only exercise them
+            # when the back-end supports the result type; `sum` keeps the input eltype and
+            # is always tested.
+            RT = typeof(abs(zero(ET)))
+
             # dim=:
             y  = sum(x)
             dy = sum(dx)
             @test y ≈ dy
 
-            y  = mapreduce(abs, +, x)
-            dy = mapreduce(abs, +, dx)
-            @test y ≈ dy
+            if RT in eltypes
+                y  = mapreduce(abs, +, x)
+                dy = mapreduce(abs, +, dx)
+                @test y ≈ dy
+            end
 
             # dim=1
             y  = sum(x, dims=1)
             dy = sum(dx, dims=1)
             @test y ≈ Array(dy)
 
-            y  = mapreduce(abs, +, x, dims=1)
-            dy = mapreduce(abs, +, dx, dims=1)
-            @test y ≈ Array(dy)
+            if RT in eltypes
+                y  = mapreduce(abs, +, x, dims=1)
+                dy = mapreduce(abs, +, dx, dims=1)
+                @test y ≈ Array(dy)
+            end
 
             # dim=2
             y = sum(x, dims=2)
             dy = sum(dx, dims=2)
             @test y ≈ Array(dy)
 
-            y  = mapreduce(abs, +, x, dims=2)
-            dy = mapreduce(abs, +, dx, dims=2)
-            @test y ≈ Array(dy)
+            if RT in eltypes
+                y  = mapreduce(abs, +, x, dims=2)
+                dy = mapreduce(abs, +, dx, dims=2)
+                @test y ≈ Array(dy)
+            end
             if ET in (Float32, Float64)
                 dy = mapreduce(abs, +, dx; init=zero(ET))
                 y  = mapreduce(abs, +, x; init=zero(ET))
@@ -342,19 +641,20 @@ function mapreduce_matrix(AT, eltypes)
             end
 
             # test with a matrix with fully empty rows
-            x = zeros(ET, m, n)
-            x[2, :] .= -one(ET)
-            x[2, end] = -ET(16)
-            dx = AT(sparse(x))
-            y  = mapreduce(abs, max, x)
-            dy = mapreduce(abs, max, dx)
-            @test y ≈ dy
+            if RT in eltypes
+                x = zeros(ET, m, n)
+                x[2, :] .= -one(ET)
+                x[2, end] = -ET(16)
+                dx = AT(sparse(x))
+                y  = mapreduce(abs, max, x)
+                dy = mapreduce(abs, max, dx)
+                @test y ≈ dy
+            end
         end
     end
 end
 
 function linalg(AT, eltypes)
-    dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
         # sprandn doesn't work nicely with these...
         if !(ET <: Union{Int16, Int32, Int64, Complex{Int16}, Complex{Int32}, Complex{Int64}})