diff --git a/Project.toml b/Project.toml index 707ee08d..d0e72995 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ projects = ["lib/GPUArraysCore", "lib/JLArrays", "test", "docs"] [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" @@ -27,6 +28,7 @@ JLD2Ext = "JLD2" [compat] Adapt = "4.6.1" +Atomix = "1.1" GPUArraysCore = "= 0.2.0" JLD2 = "0.4, 0.5, 0.6" KernelAbstractions = "0.9.28, 0.10" diff --git a/docs/src/interface.md b/docs/src/interface.md index a9c57c53..7be10900 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -31,6 +31,81 @@ KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend() There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/main/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/main/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/main/src/gpuarrays.jl). +## Sparse arrays + +A sparse array can't share the `AbstractGPUArray` supertype — that is a `DenseArray`, +whereas a sparse array must be an `AbstractSparseArray` — so GPUArrays keeps a parallel +sparse hierarchy with its own generic functionality. Integrating a back-end has three +parts: the storage types it provides, the methods it implements to plug them in, and the +functionality it then gets for free. + +### Storage types to provide + +One mutable struct per supported format, subtyping the matching abstract type and using +the conventional field names (generic code reads them directly): + +| supertype | fields | +|:--|:--| +| `AbstractGPUSparseVector{Tv,Ti}` | `iPtr`, `nzVal`, `len`, `nnz` | +| `AbstractGPUSparseMatrixCSC{Tv,Ti}` | `colPtr`, `rowVal`, `nzVal`, `dims`, `nnz` | +| `AbstractGPUSparseMatrixCSR{Tv,Ti}` | `rowPtr`, `colVal`, `nzVal`, `dims`, `nnz` | +| `AbstractGPUSparseMatrixCOO{Tv,Ti}` | `rowInd`, `colInd`, `nzVal`, `dims`, `nnz` | + +The pointer/index/value arrays are the back-end's own dense vector type. Provide only the +formats you need, but note that several generic operations route through COO. + +### Interface to implement + + * **Constructors** — from component arrays (`MyCSR(rowPtr, colVal, nzVal, dims)`), + between formats (`MyCSR(::MyCOO)`, …), and to/from host `SparseArrays` + (`MyCSC(::SparseMatrixCSC)`, `SparseMatrixCSC(::MyCSC)`). + * **`undef` constructors** — `MyCSC{Tv,Ti}(undef, dims)` / `MyVec{Tv,Ti}(undef, n)`, + building a *structurally-empty* array (no stored entries), mirroring dense + `Array{T}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`. This + is the empty-of-a-shape allocation primitive. Note there is no uninitialized-structure + analogue: for a sparse array `undef` means empty, exactly as in `SparseArrays`. + Implementing these through a `spzeros(Tv, Ti, dims…; fmt=…)` helper (the value-level + analogue of `SparseArrays.spzeros`, with a format selector) is recommended — it also + serves as a convenient public, format-polymorphic entry point — but `spzeros` itself is + not mandated, since its signature is back-end-flavored (format symbols, storage modes) + whereas the `undef` constructor is uniform. + * **`Base.similar`** — structure-preserving (`similar(A)`, `similar(A, ::Type)`) and + empty-of-a-shape (`similar(A, ::Type, dims)`), as for dense arrays; generic code + allocates its outputs through `similar`, never by naming a type. The empty-of-a-shape + form just delegates to the `undef` constructor (threading the source's storage mode), + so the constructor is the real primitive. + * **Format-conversion hooks** `GPUArrays.coo_type`/`csr_type`/`csc_type` — map any of your + sparse-matrix types to the *type* of the named sibling format + (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code converts with `coo_type(A)(A)`. These + are type-level hooks rather than plain `convert(Dest, A)` because a format is the + wrapper's identity (distinct structs), not a type parameter — so, unlike an eltype change, + there is no generic wrapper→sibling-wrapper operation, and only the back-end knows its + sibling types. The cross-format `convert` methods above are the engine the resulting + constructors route through; the identity case (`coo_type(coo)(coo)`) is your identity + constructor. + * **`KernelAbstractions.get_backend`** for the sparse types (usually + `get_backend(nonzeros(A))`). + * **`Adapt.adapt_structure`** converting each host struct to its device counterpart + (`GPUArrays.GPUSparseDeviceVector`, `GPUSparseDeviceMatrixCSC`/`CSR`/`COO`), so the + generic kernels can consume it inside `@kernel`s. + * **`GPUArrays._sptranspose`/`_spadjoint`** — materialize a (conjugate) transpose; used + by `kron`/`triu`/`tril` on lazily wrapped operands. + +`SparseArrays`' accessors (`nnz`, `nonzeros`, `nonzeroinds`, `rowvals`, `getcolptr`) come +for free from the field names. Dense↔sparse conversion is generic and on-device: +`to_sparse(::Type{ST}, dense)` scans into a sparse array (`ST` a vector or COO type; +CSR/CSC follow via the verbs) and `to_dense(A)` scatters back to a dense array of the +back-end — so a back-end's `MyArray(::MySparse…)` and dense→sparse constructors can simply +call them. + +### Functionality you get + +Broadcasting; `mapreduce` and reductions (`sum`, `norm`, `opnorm`); sparse–dense and +sparse–vector multiplication (`*`, `mul!`, including transposed/adjoint operands); +`findnz`, `triu`/`tril`/`kron`/`reshape`/`droptol!`; `iszero`/`issymmetric`/`ishermitian`; +scalar and slice indexing; `copy`/`copyto!`/`collect`/`Array`; and conversion between +formats and to/from dense. + ## Caching Allocator ```@docs diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 0efc5a5e..5ebfa26b 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -6,15 +6,14 @@ module JLArrays -export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR +export JLArray, JLVector, JLMatrix, jl, JLBackend, JLSparseVector, + JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO using GPUArrays using Adapt using SparseArrays, LinearAlgebra -import GPUArrays: dense_array_type - import KernelAbstractions import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config @@ -74,6 +73,11 @@ Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[]) @inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(typed_data(A), index) @inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(typed_data(A), x, index) + Base.pointer(x::JLDeviceArray{T}) where {T} = + convert(Ptr{T}, pointer(x.data)) + x.offset + @inline function Base.pointer(x::JLDeviceArray{T}, i::Integer) where T + pointer(x) + (i - 1) * sizeof(T) + end end # @@ -188,6 +192,48 @@ end JLSparseMatrixCSC(Mat::Union{Transpose{Tv, <:SparseMatrixCSC}, Adjoint{Tv, <:SparseMatrixCSC}}) where {Tv} = JLSparseMatrixCSC(JLSparseMatrixCSR(Mat)) +mutable struct JLSparseMatrixCOO{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCOO{Tv, Ti} + rowInd::JLArray{Ti, 1} + colInd::JLArray{Ti, 1} + nzVal::JLArray{Tv, 1} + dims::NTuple{2,Int} + nnz::Ti + + function JLSparseMatrixCOO{Tv, Ti}(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1}, + nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer}, + nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer} + length(rowInd) == length(colInd) == length(nzVal) || + throw(DimensionMismatch("COO row, column, and value arrays have different lengths")) + new{Tv, Ti}(rowInd, colInd, nzVal, (Int(dims[1]), Int(dims[2])), Ti(nnz)) + end +end +JLSparseMatrixCOO(rowInd::JLArray{Ti, 1}, colInd::JLArray{Ti, 1}, + nzVal::JLArray{Tv, 1}, dims::NTuple{2,<:Integer}, + nnz::Integer=length(nzVal)) where {Tv, Ti <: Integer} = + JLSparseMatrixCOO{Tv, Ti}(rowInd, colInd, nzVal, dims, nnz) + +JLSparseMatrixCOO(A::JLSparseMatrixCOO) = A + +Base.length(x::JLSparseMatrixCOO) = prod(x.dims) +Base.size(x::JLSparseMatrixCOO) = x.dims +SparseArrays.nnz(x::JLSparseMatrixCOO) = x.nnz +SparseArrays.nonzeros(x::JLSparseMatrixCOO) = x.nzVal + +function SparseArrays.SparseMatrixCSC(x::JLSparseMatrixCOO) + return sparse(Array(x.rowInd), Array(x.colInd), Array(x.nzVal), size(x)...) +end + +function Base.getindex(A::JLSparseMatrixCOO{Tv}, i::Integer, j::Integer) where {Tv} + @boundscheck checkbounds(A, i, j) + acc = zero(Tv) + for k in 1:Int(nnz(A)) + @inbounds if A.rowInd[k] == i && A.colInd[k] == j + acc += A.nzVal[k] + end + end + return acc +end + function Base.size(g::JLSparseMatrixCSR, d::Integer) if 1 <= d <= 2 return g.dims[d] @@ -218,33 +264,68 @@ function Base.getindex(A::JLSparseMatrixCSR{Tv, Ti}, i0::Integer, i1::Integer) w end GPUArrays.storage(a::JLArray) = a.data -GPUArrays.dense_array_type(a::JLArray{T, N}) where {T, N} = JLArray{T, N} -GPUArrays.dense_array_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, N} -GPUArrays.dense_vector_type(a::JLArray{T, N}) where {T, N} = JLArray{T, 1} -GPUArrays.dense_vector_type(::Type{JLArray{T, N}}) where {T, N} = JLArray{T, 1} - -GPUArrays.sparse_array_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSC -GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSC}) = JLSparseMatrixCSC -GPUArrays.sparse_array_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSR -GPUArrays.sparse_array_type(::Type{<:JLSparseMatrixCSR}) = JLSparseMatrixCSR -GPUArrays.sparse_array_type(sa::JLSparseVector) = JLSparseVector -GPUArrays.sparse_array_type(::Type{<:JLSparseVector}) = JLSparseVector - -GPUArrays.dense_array_type(sa::JLSparseVector) = JLArray -GPUArrays.dense_array_type(::Type{<:JLSparseVector}) = JLArray -GPUArrays.dense_array_type(sa::JLSparseMatrixCSC) = JLArray -GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSC}) = JLArray -GPUArrays.dense_array_type(sa::JLSparseMatrixCSR) = JLArray -GPUArrays.dense_array_type(::Type{<:JLSparseMatrixCSR}) = JLArray - -GPUArrays.csc_type(sa::JLSparseMatrixCSR) = JLSparseMatrixCSC -GPUArrays.csr_type(sa::JLSparseMatrixCSC) = JLSparseMatrixCSR +const JLSparseMatrix{Tv,Ti} = Union{JLSparseMatrixCSC{Tv,Ti},JLSparseMatrixCSR{Tv,Ti},JLSparseMatrixCOO{Tv,Ti}} +# format-conversion hooks: map any JLArrays sparse matrix to the *type* of its sibling +# format. Generic GPUArrays code converts with `coo_type(A)(A)` etc., routing through the +# format constructors defined below. +GPUArrays.csc_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSC +GPUArrays.csr_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCSR +GPUArrays.coo_type(::Type{<:JLSparseMatrix}) = JLSparseMatrixCOO Base.similar(Mat::JLSparseMatrixCSR) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat)) Base.similar(Mat::JLSparseMatrixCSR, T::Type) = JLSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat), T), size(Mat)) - -Base.similar(Mat::JLSparseMatrixCSC, T::Type, N::Int, M::Int) = JLSparseMatrixCSC(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M)) -Base.similar(Mat::JLSparseMatrixCSR, T::Type, N::Int, M::Int) = JLSparseMatrixCSR(JLVector([zero(Int32)]), JLVector{Int32}(undef, 0), JLVector{T}(undef, 0), (N, M)) +Base.similar(Mat::JLSparseMatrixCOO) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat)), size(Mat), nnz(Mat)) +Base.similar(Mat::JLSparseMatrixCOO, T::Type) = JLSparseMatrixCOO(copy(Mat.rowInd), copy(Mat.colInd), similar(nonzeros(Mat), T), size(Mat), nnz(Mat)) + +# CSC and vector structure-preserving `similar` (mirror the CSR/COO methods above; these +# used to come for free from GPUArrays via a type-reconstruction helper). +Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}) where {T, Ti} = + JLSparseMatrixCSC{T, Ti}(copy(Mat.colPtr), copy(Mat.rowVal), similar(nonzeros(Mat), T), size(Mat)) +Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}) where {T, Ti} = + JLSparseVector{T, Ti}(copy(Vec.iPtr), similar(nonzeros(Vec), T), length(Vec)) + +# Empty (structurally-zero) sparse allocation — the value-level analogue of +# `SparseArrays.spzeros` and the shared implementation behind the `undef` constructors and +# the empty-of-shape `similar` below. The matrix method selects the format via `fmt` +# (mirroring `sparse(A; fmt=…)`); the vector method takes a length. Unexported (so it never +# clashes with `SparseArrays.spzeros`); call it as `JLArrays.spzeros`. +function spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{2}; fmt=:csc) where {Tv, Ti} + if fmt === :csc + JLSparseMatrixCSC{Tv, Ti}(JLVector(ones(Ti, dims[2] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims) + elseif fmt === :csr + JLSparseMatrixCSR{Tv, Ti}(JLVector(ones(Ti, dims[1] + 1)), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims) + elseif fmt === :coo + JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims) + else + throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo")) + end +end +spzeros(::Type{Tv}, ::Type{Ti}, dims::Dims{1}) where {Tv, Ti} = + JLSparseVector{Tv, Ti}(JLVector{Ti}(undef, 0), JLVector{Tv}(undef, 0), dims[1]) +spzeros(::Type{Tv}, ::Type{Ti}, dims::Integer...; kw...) where {Tv, Ti} = + spzeros(Tv, Ti, map(Int, dims); kw...) +spzeros(::Type{Tv}, dims::Union{Integer,Dims}...; kw...) where {Tv} = + spzeros(Tv, Int, dims...; kw...) + +# `undef` constructors — the mandated empty-of-shape allocation primitive, mirroring the +# dense `JLArray{T,N}(undef, dims)` and `SparseArrays`' `SparseMatrixCSC{Tv,Ti}(undef, m, n)`. +JLSparseVector{Tv, Ti}(::UndefInitializer, len::Integer) where {Tv, Ti} = + spzeros(Tv, Ti, (Int(len),)) +JLSparseMatrixCSC{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csc) +JLSparseMatrixCSR{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:csr) +JLSparseMatrixCOO{Tv, Ti}(::UndefInitializer, dims::Dims{2}) where {Tv, Ti} = spzeros(Tv, Ti, dims; fmt=:coo) +JLSparseMatrixCSC{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSC{Tv, Ti}(u, (Int(m), Int(n))) +JLSparseMatrixCSR{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCSR{Tv, Ti}(u, (Int(m), Int(n))) +JLSparseMatrixCOO{Tv, Ti}(u::UndefInitializer, m::Integer, n::Integer) where {Tv, Ti} = JLSparseMatrixCOO{Tv, Ti}(u, (Int(m), Int(n))) + +# empty sparse arrays of a given shape, used by the generic broadcast machinery as the +# output container before its storage is sized and filled. the index type is preserved. +Base.similar(Mat::JLSparseMatrixCSC{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} = + JLSparseMatrixCSC{T, Ti}(undef, (N, M)) +Base.similar(Mat::JLSparseMatrixCSR{<:Any, Ti}, ::Type{T}, N::Int, M::Int) where {T, Ti} = + JLSparseMatrixCSR{T, Ti}(undef, (N, M)) +Base.similar(Vec::JLSparseVector{<:Any, Ti}, ::Type{T}, dims::Dims{1}) where {T, Ti} = + JLSparseVector{T, Ti}(undef, dims[1]) Base.similar(Mat::JLSparseMatrixCSC{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M) Base.similar(Mat::JLSparseMatrixCSR{Tv, Ti}, N::Int, M::Int) where {Tv, Ti} = similar(Mat, Tv, N, M) @@ -255,9 +336,10 @@ Base.similar(Mat::JLSparseMatrixCSR, T::Type, dims::Tuple{Int, Int}) = similar(M Base.similar(Mat::JLSparseMatrixCSC, dims::Tuple{Int, Int}) = similar(Mat, dims...) Base.similar(Mat::JLSparseMatrixCSR, dims::Tuple{Int, Int}) = similar(Mat, dims...) -JLArray(x::JLSparseVector) = JLArray(collect(SparseVector(x))) -JLArray(x::JLSparseMatrixCSC) = JLArray(collect(SparseMatrixCSC(x))) -JLArray(x::JLSparseMatrixCSR) = JLArray(collect(SparseMatrixCSC(x))) +JLArray(x::JLSparseVector) = GPUArrays.to_dense(x) +JLArray(x::JLSparseMatrixCSC) = GPUArrays.to_dense(x) +JLArray(x::JLSparseMatrixCSR) = GPUArrays.to_dense(x) +JLArray(x::JLSparseMatrixCOO) = GPUArrays.to_dense(x) # conversion of untyped data to a typed Array function typed_data(x::JLArray{T}) where {T} @@ -370,6 +452,9 @@ function JLSparseVector(xs::SparseVector{Tv, Ti}) where {Ti, Tv} copyto!(nzVal, convert(Vector{Tv}, xs.nzval)) return JLSparseVector{Tv, Ti}(iPtr, nzVal, length(xs),) end +SparseArrays.SparseVector(xs::JLArray{<:Any,1}) = SparseVector(JLSparseVector(xs)) +JLSparseVector(xs::JLArray{T,1}) where {T} = + GPUArrays.to_sparse(JLSparseVector{T,Int}, xs) Base.length(x::JLSparseVector) = x.len Base.size(x::JLSparseVector) = (x.len,) @@ -382,7 +467,9 @@ function JLSparseMatrixCSC(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv} copyto!(nzVal, convert(Vector{Tv}, xs.nzval)) return JLSparseMatrixCSC{Tv, Ti}(colPtr, rowVal, nzVal, (xs.m, xs.n)) end +SparseArrays.SparseMatrixCSC(xs::JLArray{<:Any,2}) = SparseMatrixCSC(JLSparseMatrixCSC(xs)) JLSparseMatrixCSC(xs::SparseVector) = JLSparseMatrixCSC(SparseMatrixCSC(xs)) +JLSparseMatrixCSC(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSC(JLSparseMatrixCOO(xs)) Base.length(x::JLSparseMatrixCSC) = prod(x.dims) Base.size(x::JLSparseMatrixCSC) = x.dims @@ -397,12 +484,47 @@ function JLSparseMatrixCSR(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv} return JLSparseMatrixCSR{Tv, Ti}(rowPtr, colVal, nzVal, (xs.m, xs.n)) end JLSparseMatrixCSR(xs::SparseVector{Tv, Ti}) where {Ti, Tv} = JLSparseMatrixCSR(SparseMatrixCSC(xs)) -function JLSparseMatrixCSR(xs::JLSparseMatrixCSC{Tv, Ti}) where {Ti, Tv} - return JLSparseMatrixCSR(SparseMatrixCSC(xs)) -end -function JLSparseMatrixCSC(xs::JLSparseMatrixCSR{Tv, Ti}) where {Ti, Tv} - return JLSparseMatrixCSC(SparseMatrixCSC(xs)) +JLSparseMatrixCSR(xs::JLArray{T,2}) where {T} = JLSparseMatrixCSR(JLSparseMatrixCOO(xs)) +function JLSparseMatrixCOO(xs::SparseMatrixCSC{Tv, Ti}) where {Ti, Tv} + rowInd, colInd, nzVal = findnz(xs) + return JLSparseMatrixCOO{Tv, Ti}(JLVector{Ti}(rowInd), JLVector{Ti}(colInd), + JLVector{Tv}(nzVal), (xs.m, xs.n)) +end +JLSparseMatrixCOO(xs::SparseVector) = JLSparseMatrixCOO(SparseMatrixCSC(xs)) +JLSparseMatrixCOO(xs::JLArray{T,2}) where {T} = + GPUArrays.to_sparse(JLSparseMatrixCOO{T,Int}, xs) +# Cross-format device→device conversions (CSC↔CSR↔COO): the back-end owns the constructors; +# the on-device conversion algorithm is GPUArrays' generic `convert` (replacing the former +# host round-trips through `SparseMatrixCSC`). The `csc_type/csr_type/coo_type` hooks above +# route to these. The COO→CSR/CSC path needs `sortperm`, provided below. +JLSparseMatrixCOO(A::Union{JLSparseMatrixCSC,JLSparseMatrixCSR}) = + convert(JLSparseMatrixCOO, A) +JLSparseMatrixCSR(A::Union{JLSparseMatrixCSC,JLSparseMatrixCOO}) = + convert(JLSparseMatrixCSR, A) +JLSparseMatrixCSC(A::Union{JLSparseMatrixCSR,JLSparseMatrixCOO}) = + convert(JLSparseMatrixCSC, A) + +# `sortperm` on a JLArray vector, required by the generic COO→CSR/CSC conversions (and by +# `findnz`). Computed on the host and wrapped back into a device vector so that the gather +# kernel can index the permutation on-device (mirrors Metal's `sortperm`, which also returns +# a device `Int` vector). +Base.sortperm(x::JLVector; kwargs...) = JLArray(sortperm(Array(x); kwargs...)) + +SparseArrays.sparse(xs::JLArray{T,1}) where {T} = + GPUArrays.to_sparse(JLSparseVector{T,Int}, xs) + +function SparseArrays.sparse(xs::JLArray{T,2}; fmt=:csc) where {T} + if fmt == :csc + return JLSparseMatrixCSC(xs) + elseif fmt == :csr + return JLSparseMatrixCSR(xs) + elseif fmt == :coo + return JLSparseMatrixCOO(xs) + else + throw(ArgumentError("format :$fmt not available, use :csc, :csr, or :coo")) + end end + function Base.copyto!(dst::JLSparseMatrixCSR, src::JLSparseMatrixCSR) if size(dst) != size(src) throw(ArgumentError("Inconsistent Sparse Matrix size")) @@ -456,6 +578,13 @@ Adapt.adapt_storage(::Type{Array}, xs::JLArray) = convert(Array, xs) Base.convert(::Type{T}, x::T) where T <: JLArray = x +## indexing + +Base.findall(bools::JLArray{Bool}) = JLArray(findall(Array(bools))) +Base.findall(f::Function, A::JLArray) = JLArray(findall(f, Array(A))) +Base.findall(f::Base.Fix2{typeof(in)}, A::JLArray) = JLArray(findall(f, Array(A))) + + ## broadcast import Base.Broadcast: BroadcastStyle, Broadcasted @@ -568,13 +697,15 @@ Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSC{Tv,Ti}) where {Tv,Ti} = GPUSparseDeviceMatrixCSC{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), x.dims, x.nnz) Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCSR{Tv,Ti}) where {Tv,Ti} = GPUSparseDeviceMatrixCSR{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), x.dims, x.nnz) +Adapt.adapt_structure(to::Adaptor, x::JLSparseMatrixCOO{Tv,Ti}) where {Tv,Ti} = +GPUSparseDeviceMatrixCOO{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), x.dims, x.nnz) Adapt.adapt_structure(to::Adaptor, x::JLSparseVector{Tv,Ti}) where {Tv,Ti} = GPUSparseDeviceVector{Tv,Ti,JLDeviceArray{Ti, 1}, JLDeviceArray{Tv, 1}, AS.Generic}(adapt(to, x.iPtr), adapt(to, x.nzVal), x.len, x.nnz) ## KernelAbstractions interface KernelAbstractions.get_backend(a::JLA) where JLA <: JLArray = JLBackend() -KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseVector} = JLBackend() +KernelAbstractions.get_backend(a::JLA) where JLA <: Union{JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO, JLSparseVector} = JLBackend() function KernelAbstractions.mkcontext(kernel::Kernel{JLBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic return KernelAbstractions.CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace) diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 17a4d897..d50dcee0 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -9,6 +9,7 @@ using Printf using LinearAlgebra.BLAS using Base.Cartesian +using Atomix using Adapt using LLVM.Interop diff --git a/src/host/sparse.jl b/src/host/sparse.jl index 5c84ee77..c3bf5d4b 100644 --- a/src/host/sparse.jl +++ b/src/host/sparse.jl @@ -44,29 +44,616 @@ Base.iszero(A::AbstractGPUSparseArray) = SparseArrays.nnz(A) == 0 || all(iszero, SparseArrays.SparseVector(x::AbstractGPUSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x))) SparseArrays.SparseMatrixCSC(x::AbstractGPUSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(SparseArrays.getcolptr(x)), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x))) +function check_sparse_target(::Type{ST}, ::Type{Tv}, ::Type{Ti}) where {ST,Tv,Ti} + Tv isa Type && Ti isa Type || + throw(ArgumentError("sparse target type $ST must specify value and index eltypes")) + return +end + +@kernel function dense_sparse_vector_kernel!(iPtr, nzVal, A, indices) + i = @index(Global, Linear) + if i <= length(indices) + @inbounds begin + idx = indices[i] + iPtr[i] = idx + nzVal[i] = A[idx] + end + end +end + +@kernel function dense_sparse_matrix_kernel!(rowInd, colInd, nzVal, A, indices) + i = @index(Global, Linear) + if i <= length(indices) + @inbounds begin + idx = indices[i] + rowInd[i] = idx[1] + colInd[i] = idx[2] + nzVal[i] = A[idx] + end + end +end + +""" + to_sparse(ST, A) + +Build a sparse array of type `ST` from the dense GPU array `A`, on the device (no host +round-trip); the inverse of [`to_dense`](@ref). `ST` must be a concrete +`AbstractGPUSparseVector` or `AbstractGPUSparseMatrixCOO` (other matrix formats follow by +converting the resulting COO with `csr_type`/`csc_type`). The target format is required +because the result layout would otherwise be ambiguous; `to_dense` needs no such argument. +""" +function to_sparse(::Type{ST}, A::AnyGPUVector) where {Tv,Ti,ST<:AbstractGPUSparseVector{Tv,Ti}} + check_sparse_target(ST, Tv, Ti) + indices = findall(!iszero, A) + nstored = length(indices) + iPtr = similar(A, Ti, nstored) + nzVal = similar(A, Tv, nstored) + if nstored > 0 + dense_sparse_vector_kernel!(get_backend(A))(iPtr, nzVal, A, indices; + ndrange=nstored) + end + unsafe_free!(indices) + return ST(iPtr, nzVal, length(A)) +end + +# Densifying a dense array naturally produces coordinate (COO) format, so this builds a +# COO directly. Other formats are obtained by converting the COO with `csr_type`/ +# `csc_type` (which back-ends already provide as constructors), avoiding any need to map +# a target type to its COO sibling. +function to_sparse(::Type{ST}, A::AnyGPUMatrix) where {Tv,Ti,ST<:AbstractGPUSparseMatrixCOO{Tv,Ti}} + check_sparse_target(ST, Tv, Ti) + indices = findall(!iszero, A) + nstored = length(indices) + rowInd = similar(A, Ti, nstored) + colInd = similar(A, Ti, nstored) + nzVal = similar(A, Tv, nstored) + if nstored > 0 + dense_sparse_matrix_kernel!(get_backend(A))(rowInd, colInd, nzVal, A, indices; + ndrange=nstored) + end + unsafe_free!(indices) + return ST(rowInd, colInd, nzVal, size(A), nstored) +end + +@kernel function densify_vector_kernel!(out, iPtr, nzVal) + k = @index(Global, Linear) + if k <= length(nzVal) + @inbounds out[iPtr[k]] = nzVal[k] + end +end + +@kernel function densify_matrix_kernel!(out, rowInd, colInd, nzVal) + k = @index(Global, Linear) + if k <= length(nzVal) + @inbounds out[rowInd[k], colInd[k]] = nzVal[k] + end +end + +""" + to_dense(A) + +Materialize the sparse GPU array `A` into a dense array of the same back-end, on the +device (no host round-trip); the inverse of [`to_sparse`](@ref). Stored coordinates are +assumed unique (true for CSC/CSR and for COO produced by conversion); duplicate COO +coordinates resolve to the last value written. +""" +function to_dense(x::AbstractGPUSparseVector{Tv}) where {Tv} + out = fill!(similar(nonzeros(x), Tv, length(x)), zero(Tv)) + if nnz(x) > 0 + densify_vector_kernel!(get_backend(out))(out, SparseArrays.nonzeroinds(x), nonzeros(x); + ndrange=nnz(x)) + end + return out +end + +function to_dense(A::AbstractGPUSparseMatrix{Tv}) where {Tv} + coo = coo_type(A)(A) + out = fill!(similar(nonzeros(A), Tv, size(A)), zero(Tv)) + if nnz(coo) > 0 + densify_matrix_kernel!(get_backend(out))(out, coo.rowInd, coo.colInd, nonzeros(coo); + ndrange=nnz(coo)) + end + return out +end + # similar Base.similar(Vec::V) where {V<:AbstractGPUSparseVector} = V(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec)), length(Vec)) Base.similar(Mat::M) where {M<:AbstractGPUSparseMatrixCSC} = M(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat)), size(Mat)) -Base.similar(Vec::V, T::Type) where {Tv, Ti, V<:AbstractGPUSparseVector{Tv, Ti}} = sparse_array_type(V){T, Ti}(copy(SparseArrays.nonzeroinds(Vec)), similar(SparseArrays.nonzeros(Vec), T), length(Vec)) -Base.similar(Mat::M, T::Type) where {M<:AbstractGPUSparseMatrixCSC} = sparse_array_type(M)(copy(SparseArrays.getcolptr(Mat)), copy(SparseArrays.rowvals(Mat)), similar(SparseArrays.nonzeros(Mat), T), size(Mat)) - -dense_array_type(a::AbstractArray) = dense_array_type(typeof(a)) -sparse_array_type(a::AbstractArray) = sparse_array_type(typeof(a)) -dense_vector_type(a::AbstractArray) = dense_vector_type(typeof(a)) - -dense_array_type(::Type{<:SparseVector}) = SparseVector -sparse_array_type(::Type{<:SparseVector}) = SparseVector -dense_vector_type(::Type{<:AbstractSparseArray}) = Vector -dense_vector_type(::Type{<:AbstractArray}) = Vector -dense_array_type(::Type{<:SparseMatrixCSC}) = SparseMatrixCSC -sparse_array_type(::Type{<:SparseMatrixCSC}) = SparseMatrixCSC +# `similar(A, ::Type)` (structure-preserving, new element type) is a back-end method, just +# like the dense `similar`. The generic algorithms below always build their outputs with +# `similar` (dispatched on a representative value), so there is no need for backends to +# provide `*_array_type` mapping functions to reconstruct a sparse type from a type. -coo_type(sa) = coo_type(typeof(sa)) +# Stored (minor-axis) index buffer, accessed generically through the conventional field; +# the analogue of `SparseArrays.storedinds`. Used by the broadcast/construction code. +_stored_inds(A::AbstractGPUSparseMatrixCSR) = A.colVal +_stored_inds(A::AbstractGPUSparseMatrixCSC) = A.rowVal +_stored_inds(A::AbstractGPUSparseVector) = A.iPtr function _spadjoint end function _sptranspose end +const SparseMatmulEltypes = Union{Float16,Float32,ComplexF16,ComplexF32} + +sparse_matmul_accumulator(::Type{T}) where {T} = T +sparse_matmul_accumulator(::Type{Float16}) = Float32 +sparse_matmul_accumulator(::Type{ComplexF16}) = ComplexF32 + +const GPUSparseMatrixOrWrapper = Union{ + AbstractGPUSparseMatrix, + Transpose{<:Any,<:AbstractGPUSparseMatrix}, + Adjoint{<:Any,<:AbstractGPUSparseMatrix}, +} + +function sparse_op(t::AbstractChar) + t in ('N', 'n', 'S', 's', 'H', 'h') && return false, false + t in ('T', 't') && return true, false + t in ('C', 'c') && return true, true + throw(ArgumentError("unsupported sparse matrix operation '$t'")) +end + +sparse_op_size(A, trans::Bool) = trans ? reverse(size(A)) : size(A) + +# Format-conversion hooks: a back-end maps any of its sparse-matrix types to the *type* of +# its sibling format (`coo_type(::Type{<:MyCSC}) = MyCOO`); generic code then converts with +# `coo_type(A)(A)`. This is a type-level hook rather than plain `convert(Dest, A)` because the +# format is the wrapper's identity (distinct structs/fields), not a type parameter — so unlike +# an eltype change there is no generic wrapper→sibling-wrapper operation, and only the back-end +# knows its sibling types. The cross-format `convert` algorithm below is the engine those +# constructors route through; the identity case (`coo_type(coo)(coo)`) is the back-end's +# identity constructor. +coo_type(a) = coo_type(typeof(a)) +csr_type(a) = csr_type(typeof(a)) +csc_type(a) = csc_type(typeof(a)) + +## Generic format conversions (CSC↔CSR↔COO), on-device. +# +# Conversion is owned by `convert` (dispatched on GPUArrays' own abstract types, so not type +# piracy); construction is owned by the back-end through its conventional field constructor, +# which `convert` calls — the same division `SparseArrays` uses (`convert`/constructors for +# conversion, `SparseMatrixCSC(m, n, colptr, rowval, nzval)` for construction). Conversions +# preserve the source's value/index eltypes; every buffer is allocated from a source array +# via `similar`, so the storage mode and back-end are inherited. `sortperm` on a back-end +# vector is required (the COO→CSR/CSC path; already implied by `findnz`). + +@inline function sparse_lower_bound(xs, lo::Ti, hi::Ti, key::Ti) where {Ti} + l = lo + r = hi + while l < r + m = (l + r) >>> 1 + @inbounds v = xs[m] + if v < key + l = m + one(Ti) + else + r = m + end + end + return l +end + +# Expand a compressed pointer array (CSC `colPtr` / CSR `rowPtr`, length `dim+1`) into one +# explicit lead index per stored entry: thread `lead` writes its index into the slots +# `ptr[lead]:ptr[lead+1]-1` of `out`. +@kernel function expand_ptr_kernel!(out, ptr) + lead = @index(Global, Linear) + if lead <= length(ptr) - 1 + T = eltype(out) + @inbounds begin + k = ptr[lead] + stop = ptr[lead + 1] + v = T(lead) + while k < stop + out[k] = v + k += oneunit(k) + end + end + end +end + +# Build a lead-major linear sort key per stored COO entry (lead-major: sorting by the key +# orders entries by `(lead, sub)`); `subdim` is the size of the sub axis. +@kernel function coo_sort_keys_kernel!(keys, leadInd, subInd, subdim) + i = @index(Global, Linear) + if i <= length(keys) + @inbounds keys[i] = (Int64(leadInd[i]) - Int64(1)) * Int64(subdim) + Int64(subInd[i]) + end +end + +# Gather the three COO arrays through a permutation (e.g. from `sortperm`). +@kernel function coo_gather_kernel!(leadOut, subOut, nzOut, leadIn, subIn, nzIn, perm) + i = @index(Global, Linear) + if i <= length(perm) + @inbounds begin + src = perm[i] + leadOut[i] = leadIn[src] + subOut[i] = subIn[src] + nzOut[i] = nzIn[src] + end + end +end + +# Build a compressed pointer array from sorted lead indices: `ptr[i]` is the first 1-based +# position of a stored entry whose lead index is ≥ `i`, and `ptr[end] = nnz+1`. +@kernel function compressed_ptr_from_sorted_kernel!(ptr, sortedLead, nnz) + i = @index(Global, Linear) + Ti = eltype(ptr) + if i <= length(ptr) + @inbounds ptr[i] = if i == length(ptr) + Ti(nnz) + one(Ti) + else + sparse_lower_bound(sortedLead, one(Ti), Ti(nnz) + one(Ti), Ti(i)) + end + end +end + +# Order a COO matrix's stored entries lead-major, returning fresh (lead, sub, nz) arrays at +# the source eltypes. Duplicate coordinates are preserved (no summing). +function _sorted_coo(A::AbstractGPUSparseMatrixCOO{Tv,Ti}, ::Val{lead}) where {Tv,Ti,lead} + lead === :row || lead === :col || + throw(ArgumentError("COO sort lead must be :row or :col")) + total = Int(nnz(A)) + leadIn = lead === :row ? A.rowInd : A.colInd + subIn = lead === :row ? A.colInd : A.rowInd + subdim = lead === :row ? size(A, 2) : size(A, 1) + + leadOut = similar(leadIn, Ti, total) + subOut = similar(subIn, Ti, total) + nzOut = similar(nonzeros(A), Tv, total) + total == 0 && return leadOut, subOut, nzOut + + backend = get_backend(leadOut) + keys = similar(leadIn, Int64, total) + coo_sort_keys_kernel!(backend)(keys, leadIn, subIn, subdim; ndrange=total) + perm = sortperm(keys) + coo_gather_kernel!(backend)(leadOut, subOut, nzOut, leadIn, subIn, nonzeros(A), perm; + ndrange=total) + unsafe_free!(keys) + unsafe_free!(perm) + return leadOut, subOut, nzOut +end + +# compressed pointer array of length `dim+1` from (possibly empty) sorted lead indices, +# inheriting the lead array's index eltype and storage. +function _compressed_ptr(sortedLead, dim::Integer, n::Integer) + ptr = similar(sortedLead, dim + 1) + if n > 0 + compressed_ptr_from_sorted_kernel!(get_backend(ptr))(ptr, sortedLead, n; + ndrange=dim + 1) + else + fill!(ptr, one(eltype(ptr))) + end + return ptr +end + +# Cross-format conversions are ordinary `convert` methods, dispatched on GPUArrays' own +# abstract types (so this is not type piracy and avoids the ambiguity a generic constructor +# would have with a back-end's host-array constructor). The result is built with the +# back-end's conventional field constructor `ST(components..., dims[, nnz])` — the write-side +# analogue of the `colPtr`/`rowVal`/`nzVal` read contract, and how `SparseArrays` itself +# builds (`SparseMatrixCSC(m, n, colptr, rowval, nzval)`). A back-end with unconventional +# construction overrides the relevant `convert` method. + +# CSR → COO: expand `rowPtr` into explicit row indices; columns and values are shared. +function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCOO} + n = Int(nnz(A)) + rowInd = similar(A.colVal, n) + n > 0 && expand_ptr_kernel!(get_backend(rowInd))(rowInd, A.rowPtr; ndrange=size(A, 1)) + return ST(rowInd, A.colVal, nonzeros(A), size(A), n) +end + +# CSC → COO: expand `colPtr` into explicit column indices; rows and values are shared. +function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCOO} + n = Int(nnz(A)) + colInd = similar(A.rowVal, n) + n > 0 && expand_ptr_kernel!(get_backend(colInd))(colInd, A.colPtr; ndrange=size(A, 2)) + return ST(A.rowVal, colInd, nonzeros(A), size(A), n) +end + +# COO → CSR: sort row-major, then compress the row indices into a `rowPtr`. +function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSR} + rowInd, colVal, nzVal = _sorted_coo(A, Val(:row)) + rowPtr = _compressed_ptr(rowInd, size(A, 1), Int(nnz(A))) + unsafe_free!(rowInd) + return ST(rowPtr, colVal, nzVal, size(A)) +end + +# COO → CSC: sort column-major, then compress the column indices into a `colPtr`. +function Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCOO) where {ST<:AbstractGPUSparseMatrixCSC} + colInd, rowVal, nzVal = _sorted_coo(A, Val(:col)) + colPtr = _compressed_ptr(colInd, size(A, 2), Int(nnz(A))) + unsafe_free!(colInd) + return ST(colPtr, rowVal, nzVal, size(A)) +end + +# CSC ↔ CSR route through COO (the back-end's hook yields its concrete COO type). +Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSC) where {ST<:AbstractGPUSparseMatrixCSR} = + convert(ST, coo_type(A)(A)) +Base.convert(::Type{ST}, A::AbstractGPUSparseMatrixCSR) where {ST<:AbstractGPUSparseMatrixCSC} = + convert(ST, coo_type(A)(A)) + +@inline function sparse_atomic_add!(A::AbstractArray{<:Real}, parts, I::Integer, x) + @inbounds Atomix.@atomic A[Int(I)] += x + return +end + +@inline function sparse_atomic_add!(A::AbstractArray{<:Complex}, parts, I::Integer, x) + j = 2 * Int(I) + @inbounds begin + Atomix.@atomic parts[j - 1] += real(x) + Atomix.@atomic parts[j] += imag(x) + end + return +end + +sparse_atomic_parts(A::AbstractArray{<:Real}) = A +sparse_atomic_parts(A::AbstractArray{<:Complex{T}}) where {T} = reinterpret(T, A) + +@kernel function sparse_scale_kernel!(Y, beta) + i = @index(Global, Linear) + if i <= length(Y) + T = eltype(Y) + @inbounds Y[i] = iszero(beta) ? zero(T) : T(beta) * Y[i] + end +end + +@kernel function sparse_combine_kernel!(Y, acc, alpha, beta, ::Val{Tacc}) where {Tacc} + i = @index(Global, Linear) + if i <= length(Y) + Ty = eltype(Y) + @inbounds Y[i] = iszero(beta) ? + Ty(Tacc(alpha) * Tacc(acc[i])) : + Ty(Tacc(alpha) * Tacc(acc[i]) + Tacc(beta) * Tacc(Y[i])) + end +end + +function sparse_scale!(Y, beta) + isempty(Y) && return Y + sparse_scale_kernel!(get_backend(Y))(Y, beta; ndrange=length(Y)) + return Y +end + +function sparse_combine!(Y::AnyGPUArray{Ty}, acc, alpha, beta) where {Ty} + isempty(Y) && return Y + Tacc = sparse_matmul_accumulator(Ty) + sparse_combine_kernel!(get_backend(Y))(Y, acc, Tacc(alpha), Tacc(beta), Val(Tacc); + ndrange=length(Y)) + return Y +end + +@kernel function spmv_coo_kernel!(y, yparts, alpha, rowInd, colInd, nzVal, x, + trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc} + k = @index(Global, Linear) + if k <= length(nzVal) + @inbounds begin + out = trans ? colInd[k] : rowInd[k] + src = trans ? rowInd[k] : colInd[k] + v = Tacc(nzVal[k]) + v = conjval ? conj(v) : v + sparse_atomic_add!(y, yparts, out, Tacc(alpha) * v * Tacc(x[src])) + end + end +end + +@kernel function spmm_coo_kernel!(C, Cparts, alpha, rowInd, colInd, nzVal, B, + trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc} + k, j = @index(Global, NTuple) + if k <= length(nzVal) && j <= size(B, 2) + @inbounds begin + outrow = trans ? colInd[k] : rowInd[k] + srcrow = trans ? rowInd[k] : colInd[k] + v = Tacc(nzVal[k]) + v = conjval ? conj(v) : v + lin = Int(outrow) + (Int(j) - 1) * size(C, 1) + sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * v * Tacc(B[srcrow, j])) + end + end +end + +@kernel function dense_spmm_coo_kernel!(C, Cparts, alpha, A, rowInd, colInd, nzVal, + trans::Bool, conjval::Bool, ::Val{Tacc}) where {Tacc} + i, k = @index(Global, NTuple) + if i <= size(C, 1) && k <= length(nzVal) + @inbounds begin + outcol = trans ? rowInd[k] : colInd[k] + srccol = trans ? colInd[k] : rowInd[k] + v = Tacc(nzVal[k]) + v = conjval ? conj(v) : v + lin = Int(i) + (Int(outcol) - 1) * size(C, 1) + sparse_atomic_add!(C, Cparts, lin, Tacc(alpha) * Tacc(A[i, srccol]) * v) + end + end +end + +function spmv_coo!(y::AnyGPUVector{Ty}, A::AbstractGPUSparseMatrixCOO, x, + alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Ty} + Tacc = sparse_matmul_accumulator(Ty) + if Tacc === Ty + sparse_scale!(y, beta) + target = y + scatter_alpha = Tacc(alpha) + else + target = similar(y, Tacc, size(y)) + fill!(target, zero(Tacc)) + scatter_alpha = one(Tacc) + end + + if nnz(A) > 0 + parts = sparse_atomic_parts(target) + spmv_coo_kernel!(get_backend(y))(target, parts, scatter_alpha, A.rowInd, A.colInd, + nonzeros(A), x, trans, conjval, Val(Tacc); + ndrange=nnz(A)) + end + + if !(Tacc === Ty) + sparse_combine!(y, target, alpha, beta) + unsafe_free!(target) + end + return y +end + +function spmm_coo!(C::AnyGPUMatrix{Tc}, A::AbstractGPUSparseMatrixCOO, B, + alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc} + Tacc = sparse_matmul_accumulator(Tc) + if Tacc === Tc + sparse_scale!(C, beta) + target = C + scatter_alpha = Tacc(alpha) + else + target = similar(C, Tacc, size(C)) + fill!(target, zero(Tacc)) + scatter_alpha = one(Tacc) + end + + if nnz(A) > 0 && size(C, 2) > 0 + parts = sparse_atomic_parts(target) + spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A.rowInd, A.colInd, + nonzeros(A), B, trans, conjval, Val(Tacc); + ndrange=(nnz(A), size(C, 2))) + end + + if !(Tacc === Tc) + sparse_combine!(C, target, alpha, beta) + unsafe_free!(target) + end + return C +end + +function dense_spmm_coo!(C::AnyGPUMatrix{Tc}, A, B::AbstractGPUSparseMatrixCOO, + alpha::Number, beta::Number, trans::Bool, conjval::Bool) where {Tc} + Tacc = sparse_matmul_accumulator(Tc) + if Tacc === Tc + sparse_scale!(C, beta) + target = C + scatter_alpha = Tacc(alpha) + else + target = similar(C, Tacc, size(C)) + fill!(target, zero(Tacc)) + scatter_alpha = one(Tacc) + end + + if nnz(B) > 0 && size(C, 1) > 0 + parts = sparse_atomic_parts(target) + dense_spmm_coo_kernel!(get_backend(C))(target, parts, scatter_alpha, A, B.rowInd, + B.colInd, nonzeros(B), trans, conjval, + Val(Tacc); + ndrange=(size(C, 1), nnz(B))) + end + + if !(Tacc === Tc) + sparse_combine!(C, target, alpha, beta) + unsafe_free!(target) + end + return C +end + +function Base.:(*)(A::GPUSparseMatrixOrWrapper, x::AnyGPUVector) + size(A, 2) == length(x) || throw(DimensionMismatch()) + y = similar(x, promote_type(eltype(A), eltype(x)), size(A, 1)) + return LinearAlgebra.mul!(y, A, x) +end + +function Base.:(*)(A::GPUSparseMatrixOrWrapper, B::AnyGPUMatrix) + size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + C = similar(B, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2))) + return LinearAlgebra.mul!(C, A, B) +end + +function Base.:(*)(A::AnyGPUMatrix, B::GPUSparseMatrixOrWrapper) + size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + C = similar(A, promote_type(eltype(A), eltype(B)), (size(A, 1), size(B, 2))) + return LinearAlgebra.mul!(C, A, B) +end + +LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar, + A::AbstractGPUSparseMatrix{Ta}, x::AnyGPUVector{Tx}, + muladd::MulAddMul) where { + Ty<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tx<:SparseMatmulEltypes, + } = + LinearAlgebra.generic_matvecmul!(y, tA, A, x, muladd.alpha, muladd.beta) + +function LinearAlgebra.generic_matvecmul!(y::AnyGPUVector{Ty}, tA::AbstractChar, + A::AbstractGPUSparseMatrix{Ta}, + x::AnyGPUVector{Tx}, + alpha::Number, beta::Number) where { + Ty<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tx<:SparseMatmulEltypes, + } + trans, conjval = sparse_op(tA) + Asize = sparse_op_size(A, trans) + Asize[2] == length(x) || + throw(DimensionMismatch("A has dimensions $Asize but x has length $(length(x))")) + Asize[1] == length(y) || + throw(DimensionMismatch("A has dimensions $Asize but y has length $(length(y))")) + y === x && throw(ArgumentError("output vector must not be aliased with input vector")) + isempty(y) && return y + + return spmv_coo!(y, coo_type(A)(A), x, alpha, beta, trans, conjval) +end + +LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB, + A::AbstractGPUSparseMatrix{Ta}, B::AnyGPUMatrix{Tb}, + muladd::MulAddMul) where { + Tc<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tb<:SparseMatmulEltypes, + } = + LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta) + +function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB, + A::AbstractGPUSparseMatrix{Ta}, + B::AnyGPUMatrix{Tb}, + alpha::Number, beta::Number) where { + Tc<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tb<:SparseMatmulEltypes, + } + trans, conjval = sparse_op(tA) + Bop = LinearAlgebra.wrap(B, tB) + Asize = sparse_op_size(A, trans) + Asize[2] == size(Bop, 1) || + throw(DimensionMismatch("A has dimensions $Asize but B has dimensions $(size(Bop))")) + size(C) == (Asize[1], size(Bop, 2)) || + throw(DimensionMismatch("C has dimensions $(size(C)), should have $((Asize[1], size(Bop, 2)))")) + C === B && throw(ArgumentError("output matrix must not be aliased with input matrix")) + isempty(C) && return C + + return spmm_coo!(C, coo_type(A)(A), Bop, alpha, beta, trans, conjval) +end + +LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB, + A::AnyGPUMatrix{Ta}, B::AbstractGPUSparseMatrix{Tb}, + muladd::MulAddMul) where { + Tc<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tb<:SparseMatmulEltypes, + } = + LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, muladd.alpha, muladd.beta) + +function LinearAlgebra.generic_matmatmul!(C::AnyGPUMatrix{Tc}, tA, tB, + A::AnyGPUMatrix{Ta}, + B::AbstractGPUSparseMatrix{Tb}, + alpha::Number, beta::Number) where { + Tc<:SparseMatmulEltypes, + Ta<:SparseMatmulEltypes, + Tb<:SparseMatmulEltypes, + } + Aop = LinearAlgebra.wrap(A, tA) + trans, conjval = sparse_op(tB) + Bsize = sparse_op_size(B, trans) + size(Aop, 2) == Bsize[1] || + throw(DimensionMismatch("A has dimensions $(size(Aop)) but B has dimensions $Bsize")) + size(C) == (size(Aop, 1), Bsize[2]) || + throw(DimensionMismatch("C has dimensions $(size(C)), should have $((size(Aop, 1), Bsize[2]))")) + C === A && throw(ArgumentError("output matrix must not be aliased with input matrix")) + isempty(C) && return C + + return dense_spmm_coo!(C, Aop, coo_type(B)(B), alpha, beta, trans, conjval) +end + function LinearAlgebra.opnorm(A::AbstractGPUSparseMatrixCSR, p::Real=2) if p == Inf return maximum(sum(abs, A; dims=2)) @@ -92,7 +679,7 @@ function LinearAlgebra.norm(A::AbstractGPUSparseMatrix{T}, p::Real=2) where T end function SparseArrays.findnz(S::MT) where {MT <: AbstractGPUSparseMatrix} - S2 = coo_type(MT)(S) + S2 = coo_type(S)(S) I = S2.rowInd J = S2.colInd V = S2.nzVal @@ -134,16 +721,16 @@ for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCS LinearAlgebra.triu(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} = ST( triu(coo_type(A)(A), k) ) LinearAlgebra.triu(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} = - ST( triu(coo_type(A)(_sptranspose(parent(A))), k) ) + ST( triu(coo_type(parent(A))(_sptranspose(parent(A))), k) ) LinearAlgebra.triu(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} = - ST( triu(coo_type(A)(_spadjoint(parent(A))), k) ) + ST( triu(coo_type(parent(A))(_spadjoint(parent(A))), k) ) LinearAlgebra.tril(A::ST, k::Integer) where {T, ST<:$SparseMatrixType{T}} = ST( tril(coo_type(A)(A), k) ) LinearAlgebra.tril(A::Transpose{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} = - ST( tril(coo_type(A)(_sptranspose(parent(A))), k) ) + ST( tril(coo_type(parent(A))(_sptranspose(parent(A))), k) ) LinearAlgebra.tril(A::Adjoint{T,<:ST}, k::Integer) where {T, ST<:$SparseMatrixType{T}} = - ST( tril(coo_type(A)(_spadjoint(parent(A))), k) ) + ST( tril(coo_type(parent(A))(_spadjoint(parent(A))), k) ) LinearAlgebra.triu(A::Union{ST, Transpose{T,<:ST}, Adjoint{T,<:ST}}) where {T, ST<:$SparseMatrixType{T}} = ST( triu(coo_type(A)(A), 0) ) @@ -158,18 +745,18 @@ for SparseMatrixType in [:AbstractGPUSparseMatrixCSC, :AbstractGPUSparseMatrixCS ST( kron(A, coo_type(B)(B)) ) LinearAlgebra.kron(A::Transpose{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} = - ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(B)(B)) ) + ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(B)(B)) ) LinearAlgebra.kron(A::ST, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} = ST( kron(coo_type(A)(A), coo_type(parent(B))(_sptranspose(parent(B)))) ) LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} = - ST( kron(coo_type(A)(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) ) + ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), coo_type(parent(B))(_sptranspose(parent(B)))) ) LinearAlgebra.kron(A::Transpose{T,<:ST}, B::Diagonal) where {T, ST<:$SparseMatrixType{T}} = - ST( kron(coo_type(A)(_sptranspose(parent(A))), B) ) + ST( kron(coo_type(parent(A))(_sptranspose(parent(A))), B) ) LinearAlgebra.kron(A::Diagonal, B::Transpose{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} = - ST( kron(A, coo_type(B)(_sptranspose(parent(B)))) ) + ST( kron(A, coo_type(parent(B))(_sptranspose(parent(B)))) ) LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::ST) where {T, ST<:$SparseMatrixType{T}} = - ST( kron(coo_type(A)(_spadjoint(parent(A))), coo_type(B)(B)) ) + ST( kron(coo_type(parent(A))(_spadjoint(parent(A))), coo_type(B)(B)) ) LinearAlgebra.kron(A::ST, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} = ST( kron(coo_type(A)(A), coo_type(parent(B))(_spadjoint(parent(B)))) ) LinearAlgebra.kron(A::Adjoint{T,<:ST}, B::Adjoint{T,<:ST}) where {T, ST<:$SparseMatrixType{T}} = @@ -814,17 +1401,10 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt elseif length(sparse_args) == 1 && sparse_typ <: Union{AbstractGPUSparseMatrixCSR, AbstractGPUSparseMatrixCSC} # we only have a single sparse input, so we can reuse its structure for the output. # this avoids a kernel launch and costly synchronization. - if sparse_typ <: AbstractGPUSparseMatrixCSR - offsets = rowPtr = sparse_arg.rowPtr - colVal = similar(sparse_arg.colVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = sparse_array_type(sparse_typ)(rowPtr, colVal, nzVal, size(bc)) - elseif sparse_typ <: AbstractGPUSparseMatrixCSC - offsets = colPtr = sparse_arg.colPtr - rowVal = similar(sparse_arg.rowVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = sparse_array_type(sparse_typ)(colPtr, rowVal, nzVal, size(bc)) - end + # structure is preserved, so a structure-preserving `similar` gives the output + # container directly; the kernel rewrites the (identical) pointer array from `offsets`. + output = similar(sparse_arg, Tv) + offsets = sparse_typ <: AbstractGPUSparseMatrixCSR ? sparse_arg.rowPtr : sparse_arg.colPtr else # determine the number of non-zero elements per row so that we can create an # appropriately-structured output container @@ -870,17 +1450,22 @@ function Broadcast.copy(bc::Broadcasted{<:Union{GPUSparseVecStyle,GPUSparseMatSt total_nnz = mapreduce(x->first(x) != typemax(first(x)), +, offsets) end output = if sparse_typ <: Union{AbstractGPUSparseMatrixCSR,AbstractGPUSparseMatrixCSC} - ixVal = similar(offsets, Ti, total_nnz) - nzVal = similar(offsets, Tv, total_nnz) - output_sparse_typ = sparse_array_type(sparse_typ) - output_sparse_typ(offsets, ixVal, nzVal, size(bc)) + # build an empty sparse output of the same kind, then size its index/value + # storage to the computed nnz. The kernel fills the pointer array (from + # `offsets`), the stored indices, and the values. + out = similar(sparse_arg, Tv, size(bc)) + resize!(_stored_inds(out), total_nnz) + resize!(nonzeros(out), total_nnz) + out.nnz = total_nnz + out elseif sparse_typ <: AbstractGPUSparseVector && !fpreszeros - val_array = bc.args[first(sparse_args)].nzVal - similar(val_array, Tv, size(bc)) + similar(nonzeros(sparse_arg), Tv, size(bc)) elseif sparse_typ <: AbstractGPUSparseVector && fpreszeros - iPtr = similar(offsets, Ti, total_nnz) - nzVal = similar(offsets, Tv, total_nnz) - sparse_array_type(sparse_arg){Tv, Ti}(iPtr, nzVal, rows) + out = similar(sparse_arg, Tv, (rows,)) + resize!(_stored_inds(out), total_nnz) + resize!(nonzeros(out), total_nnz) + out.nnz = total_nnz + out end if sparse_typ <: AbstractGPUSparseVector && !fpreszeros nonsparse_args = map(bc.args) do arg @@ -983,9 +1568,6 @@ end end ## COV_EXCL_STOP -function csc_type end -function csr_type end - # TODO: implement mapreducedim! function Base.mapreduce(f, op, A::AbstractGPUSparseMatrix; dims=:, init=nothing) # figure out the destination container type by looking at the initializer element, diff --git a/test/runtests.jl b/test/runtests.jl index a91715b9..f60c43e6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,7 @@ const init_worker_code = quote include("testsuite.jl") - TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR) + TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR, JLSparseMatrixCOO) TestSuite.sparse_types(::Type{<:Array}) = (SparseVector, SparseMatrixCSC) # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl index 3e0d92fe..f54ffbf8 100644 --- a/test/testsuite/sparse.jl +++ b/test/testsuite/sparse.jl @@ -1,27 +1,296 @@ @testsuite "sparse" (AT, eltypes)->begin sparse_ATs = sparse_types(AT) + matrix_ATs = filter(T -> T <: AbstractSparseMatrix, sparse_ATs) for sparse_AT in sparse_ATs if sparse_AT <: AbstractSparseVector - vector(sparse_AT, eltypes) - vector_construction(sparse_AT, eltypes) - broadcasting_vector(sparse_AT, eltypes) + vector(sparse_AT, AT, eltypes) + vector_construction(sparse_AT, AT, eltypes) + broadcasting_vector(sparse_AT, AT, eltypes) iszero_vector(sparse_AT, eltypes) elseif sparse_AT <: AbstractSparseMatrix - matrix(sparse_AT, eltypes) - matrix_construction(sparse_AT, eltypes) - broadcasting_matrix(sparse_AT, eltypes) - mapreduce_matrix(sparse_AT, eltypes) - linalg(sparse_AT, eltypes) - iszero_matrix(sparse_AT, eltypes) + # `matmul` runs for every format (incl. COO). The matrix battery below is + # CSC/CSR-only: COO lacks the broadcast/reductions/`opnorm`/`issymmetric` it uses. + matmul(sparse_AT, AT, eltypes) + # Float32 accumulation is a GPU-kernel contract; stdlib `Array` accumulates + # narrowly, so assert it only for GPU formats. + sparse_AT <: GPUArrays.AbstractGPUSparseMatrix && + matmul_accumulation(sparse_AT, AT, eltypes) + if sparse_AT <: Union{GPUArrays.AbstractGPUSparseMatrixCSC, + GPUArrays.AbstractGPUSparseMatrixCSR} + matrix(sparse_AT, eltypes) + matrix_construction(sparse_AT, AT, eltypes) + broadcasting_matrix(sparse_AT, AT, eltypes) + mapreduce_matrix(sparse_AT, eltypes) + linalg(sparse_AT, eltypes) + iszero_matrix(sparse_AT, eltypes) + end end end + undef_construction(sparse_ATs, eltypes) + # cross-format conversions need ≥2 registered matrix formats to be meaningful. + length(matrix_ATs) > 1 && conversions(matrix_ATs, AT, eltypes) + # the repeated-index COO matmul test runs against whichever COO type the back-end + # registers in `sparse_types` (no separate `sparse_coo_type` hook). + i = findfirst(T -> T <: GPUArrays.AbstractGPUSparseMatrixCOO, sparse_ATs) + i === nothing || coo_matmul(sparse_ATs[i], AT, eltypes) end using SparseArrays using SparseArrays: nonzeroinds, nonzeros, rowvals -function vector(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) +function coo_values(::Type{T}) where {T<:Real} + return T[1, 2, 3, 4] +end + +function coo_values(::Type{T}) where {T<:Complex} + return T[1 + im, 2 - 3im, 3 + 2im, 4 - im] +end + +function coo_vector(::Type{T}) where {T<:Real} + return T[2, 3, 5] +end + +function coo_vector(::Type{T}) where {T<:Complex} + return T[2 - im, 3 + 4im, 5 - 2im] +end + +function coo_rhs(::Type{T}) where {T<:Real} + return T[ + 1 2; + 3 4; + 5 6 + ] +end + +function coo_rhs(::Type{T}) where {T<:Complex} + return T[ + 1 + im 2 - im; + 3 + 2im 4 - 3im; + 5 - im 6 + im + ] +end + +function coo_lhs(::Type{T}) where {T<:Real} + return T[ + 1 2 3; + 4 5 6 + ] +end + +function coo_lhs(::Type{T}) where {T<:Complex} + return T[ + 1 - im 2 + im 3 - 2im; + 4 + 3im 5 - im 6 + im + ] +end + +# A non-zero-preserving broadcast densifies its result: GPU back-ends return the dense +# array type `dense_AT`, whereas CPU `SparseArrays` returns its sparse type with every +# entry explicitly stored. Accept either so the suite is meaningful for both. +is_densified(x, dense_AT, ::Type{ET}) where {ET} = + dense_AT <: GPUArrays.AnyGPUArray ? x isa dense_AT{ET} : (issparse(x) && eltype(x) === ET) + +function coo_matmul(AT, dense_AT, eltypes) + for ET in (Float16, Float32, ComplexF16, ComplexF32) + ET in eltypes || continue + @testset "COO repeated-index matmul($ET)" begin + I = Int32[1, 1, 2, 3] + J = Int32[1, 1, 2, 1] + V = coo_values(ET) + A = sparse(Int.(I), Int.(J), V, 3, 3) + dA = AT(dense_AT(I), dense_AT(J), dense_AT(V), (3, 3), length(V)) + + x = coo_vector(ET) + dx = dense_AT(x) + @test Array(dA * dx) ≈ Array(A * x) + @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x) + @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x) + + old_y = dense_AT(fill(ET(1), 3)) + mul!(old_y, dA, dx, ET(2), ET(3)) + @test Array(old_y) ≈ Array(ET(2) .* (A * x) .+ ET(3)) + + B = coo_rhs(ET) + dB = dense_AT(B) + @test Array(dA * dB) ≈ Array(A * B) + @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B) + @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B) + + L = coo_lhs(ET) + dL = dense_AT(L) + @test Array(dL * dA) ≈ Array(L * A) + @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A)) + @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A)) + end + end +end + +# Narrow complex test data to the matmul eltype: complex eltypes keep both parts (so +# `transpose` and `adjoint` differ); real eltypes take the real part, since a complex value +# with a nonzero imaginary part can't be converted to a real type directly. Projecting onto +# the real part reproduces the proven real test data exactly (e.g. `1 + 2im` → `1`). +matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Complex} = ET.(x) +matmul_cast(x::AbstractArray, ::Type{ET}) where {ET<:Real} = ET.(real.(x)) +matmul_cast(x::Number, ::Type{ET}) where {ET<:Complex} = ET(x) +matmul_cast(x::Number, ::Type{ET}) where {ET<:Real} = ET(real(x)) + +# Per matrix format, general (duplicate-free) matmul: SpMV, SpMM, and dense·sparse, each +# with transpose/adjoint and the 5-arg `mul!`. The repeated-coordinate COO case is covered +# separately by `coo_matmul`. +function matmul(AT, dense_AT, eltypes) + for ET in (Float16, Float32, ComplexF16, ComplexF32) + ET in eltypes || continue + @testset "$(nameof(AT)) matmul($ET)" begin + V = matmul_cast(ComplexF32[1 + 2im, 2 - im, 3 + im, 4 - 3im, 5 + 2im], ET) + A = sparse([1, 1, 2, 4, 4], [1, 3, 2, 1, 4], V, 4, 4) + dA = AT(A) + + α = matmul_cast(2 + im, ET) + β = matmul_cast(3 - im, ET) + + # SpMV, transpose/adjoint, and the 5-arg and bool `mul!` + x = matmul_cast(ComplexF32[2 - im, 3 + 2im, 5 - 4im, 7 + im], ET) + dx = dense_AT(x) + @test Array(dA * dx) ≈ Array(A * x) + @test Array(transpose(dA) * dx) ≈ Array(transpose(A) * x) + @test Array(adjoint(dA) * dx) ≈ Array(adjoint(A) * x) + + y = matmul_cast(ComplexF32[1 + im, 2 - im, 3 + 2im, 4 - im], ET) + dy = dense_AT(copy(y)) + mul!(dy, dA, dx, α, β) + @test Array(dy) ≈ α .* Array(A * x) .+ β .* y + + dz = dense_AT(zeros(ET, size(A, 1))) + mul!(dz, dA, dx, true, false) + @test Array(dz) ≈ Array(A * x) + + # SpMM, transpose/adjoint, and the 5-arg `mul!` + B = matmul_cast(ComplexF32[ + 1 + im 2 - im 3 + 2im; + 4 - im 5 + im 6 - 2im; + 7 + im 8 - im 9 + 3im; + 2 + 4im 3 - 5im 4 + im + ], ET) + dB = dense_AT(B) + @test Array(dA * dB) ≈ Array(A * B) + @test Array(transpose(dA) * dB) ≈ Array(transpose(A) * B) + @test Array(adjoint(dA) * dB) ≈ Array(adjoint(A) * B) + + C = matmul_cast(ComplexF32[ + 1 - im 2 + im 3 - 2im; + 4 + im 5 - im 6 + 2im; + 7 - im 8 + im 9 - 3im; + 2 - 4im 3 + 5im 4 - im + ], ET) + dC = dense_AT(copy(C)) + mul!(dC, dA, dB, α, β) + @test Array(dC) ≈ α .* Array(A * B) .+ β .* C + + # dense·sparse, transpose/adjoint, and the 5-arg `mul!` + L = matmul_cast(ComplexF32[ + 1 - im 2 + im 3 - 2im 4 + im; + 5 + 2im 6 - im 7 + im 8 - 3im + ], ET) + dL = dense_AT(L) + @test Array(dL * dA) ≈ Array(L * A) + @test Array(dL * transpose(dA)) ≈ Array(L * transpose(A)) + @test Array(dL * adjoint(dA)) ≈ Array(L * adjoint(A)) + + D = matmul_cast(ComplexF32[ + 1 + im 2 - im 3 + 2im 4 - im; + 2 + 3im 3 - im 5 + im 7 - 2im + ], ET) + dD = dense_AT(copy(D)) + mul!(dD, dL, dA, α, β) + @test Array(dD) ≈ α .* Array(L * A) .+ β .* D + end + end +end + +# Per matrix format, the Float16/ComplexF16 accumulation contract: the back-end must +# accumulate in Float32/ComplexF32 and round once at the end. Every product feeding a given +# output entry is identical, so the n-term sum is independent of accumulation order, and the +# result must match a host reference accumulated in `Tacc` *exactly* (`==`, not `≈`); a +# narrow Float16 accumulation would lose precision and the equality would fail. +function matmul_accumulation(AT, dense_AT, eltypes) + for ET in (Float16, ComplexF16) + ET in eltypes || continue + Tacc = ET <: Complex ? ComplexF32 : Float32 + @testset "$(nameof(AT)) matmul accumulation($ET)" begin + n = 128 + av = matmul_cast(0.1 + 0.2im, ET) + xv = matmul_cast(0.1 - 0.3im, ET) + + # SpMV: a 1×n row times a length-n vector + A = sparse(fill(1, n), 1:n, fill(av, n), 1, n) + x = fill(xv, n) + dA = AT(A) + dx = dense_AT(x) + @test Array(dA * dx) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(x)) + + # SpMM: the same row times an n×3 dense matrix + B = fill(xv, n, 3) + dB = dense_AT(B) + @test Array(dA * dB) == ET.(SparseMatrixCSC{Tacc,Int}(A) * Tacc.(B)) + + # dense·sparse: a 3×n dense matrix times an n×1 column + S = sparse(1:n, fill(1, n), fill(av, n), n, 1) + dS = AT(S) + L = fill(xv, 3, n) + dL = dense_AT(L) + @test Array(dL * dS) == ET.(Tacc.(L) * SparseMatrixCSC{Tacc,Int}(S)) + end + end +end + +# Round-trip a duplicate-free matrix (and an empty one) through every ordered pair of +# registered matrix formats: `src == dst` exercises the identity path and `src != dst` the +# cross-format `convert`. Conversions preserve coordinates, so the round-trip is exact. +function conversions(matrix_ATs, dense_AT, eltypes) + @testset "sparse format conversions" begin + A = sparse([1, 4, 2, 4, 1], [3, 1, 2, 5, 1], Float32[3, 4, 5, 6, 7], 4, 5) + Z = spzeros(Float32, 3, 4) + for src in matrix_ATs, dst in matrix_ATs + @test SparseMatrixCSC(dst(src(A))) == A + @test SparseMatrixCSC(dst(src(Z))) == Z + end + end +end + +# The mandated empty-of-shape primitive: `ST{Tv,Ti}(undef, dims)` builds a structurally +# empty array (no stored entries), mirroring dense `undef` and `SparseArrays`' undef +# constructors. Exercised per registered format; the back-end's index type is read off a +# reference instance so the test stays back-end agnostic. +function undef_construction(sparse_ATs, eltypes) + @testset "undef construction" begin + for ST in sparse_ATs + if ST <: AbstractSparseVector + Ti = typeof(ST(spzeros(Float32, 3))).parameters[2] + for Tv in (Float32, ComplexF32) + Tv in eltypes || continue + v = ST{Tv,Ti}(undef, 5) + @test v isa ST{Tv,Ti} + @test size(v) == (5,) + @test nnz(v) == 0 + @test SparseVector(v) == spzeros(Tv, 5) + end + else + Ti = typeof(ST(spzeros(Float32, 3, 4))).parameters[2] + for Tv in (Float32, ComplexF32) + Tv in eltypes || continue + A = ST{Tv,Ti}(undef, (3, 4)) + @test A isa ST{Tv,Ti} + @test size(A) == (3, 4) + @test nnz(A) == 0 + @test SparseMatrixCSC(A) == spzeros(Tv, 3, 4) + @test size(ST{Tv,Ti}(undef, 3, 4)) == (3, 4) # varargs form + end + end + end + end +end + +function vector(AT, dense_AT, eltypes) for ET in eltypes @testset "Sparse vector properties($ET)" begin m = 25 @@ -58,8 +327,7 @@ function vector(AT, eltypes) end end -function vector_construction(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) +function vector_construction(AT, dense_AT, eltypes) for ET in eltypes m = 25 n = 35 @@ -71,11 +339,20 @@ function vector_construction(AT, eltypes) @test collect(d_x) == collect(x) @test similar(d_x) isa AT{ET} @test similar(d_x, Float32) isa AT{Float32} + + dense_x = dense_AT(collect(x)) + @test SparseVector(dense_x) == x + if dense_x isa GPUArrays.AnyGPUArray + typed_d_x = AT(dense_x) + @test typed_d_x isa typeof(d_x) + @test SparseVector(typed_d_x) == x + end + @test sparse(dense_x) isa AT{ET} + @test SparseVector(sparse(dense_x)) == x end end function matrix(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) for ET in eltypes @testset "Sparse matrix properties($ET)" begin m = 25 @@ -131,8 +408,7 @@ function matrix(AT, eltypes) end end -function matrix_construction(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) +function matrix_construction(AT, dense_AT, eltypes) for ET in eltypes m = 25 n = 35 @@ -148,11 +424,23 @@ function matrix_construction(AT, eltypes) @test similar(d_x, Float32, (n, m)) isa AT{Float32} @test similar(d_x, (3, 4)) isa AT{ET} @test size(similar(d_x, (3, 4))) == (3, 4) + + dense_x = dense_AT(collect(x)) + @test SparseMatrixCSC(dense_x) == x + if dense_x isa GPUArrays.AnyGPUArray + typed_d_x = AT(dense_x) + @test typed_d_x isa typeof(d_x) + @test SparseMatrixCSC(typed_d_x) == x + end + @test issparse(sparse(dense_x)) + @test SparseMatrixCSC(sparse(dense_x)) == x + if dense_x isa GPUArrays.AnyGPUArray + @test SparseMatrixCSC(sparse(dense_x; fmt=:csr)) == x + end end end -function broadcasting_vector(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) +function broadcasting_vector(AT, dense_AT, eltypes) for ET in eltypes @testset "SparseVector($ET)" begin m = 64 @@ -172,14 +460,14 @@ function broadcasting_vector(AT, eltypes) # not zero-preserving y = x .+ ET(1) dy = dx .+ ET(1) - @test dy isa dense_AT{ET} + @test is_densified(dy, dense_AT, ET) hy = Array(dy) @test Array(y) == hy # involving something dense y = x .+ ones(ET, m) dy = dx .+ dense_AT(ones(ET, m)) - @test dy isa dense_AT{ET} + @test is_densified(dy, dense_AT, ET) @test Array(y) == Array(dy) # sparse to sparse @@ -210,7 +498,7 @@ function broadcasting_vector(AT, eltypes) dw = AT(w) z = @. x * y * w * dense_arr dz = @. dx * dy * dw * d_dense_arr - @test dz isa dense_AT{ET} + @test is_densified(dz, dense_AT, ET) @test Array(z) == Array(dz) y = sprand(ET, m, p) @@ -226,7 +514,7 @@ function broadcasting_vector(AT, eltypes) dx = AT(x) dy = dx .+ 1 y = x .+ 1 - @test dy isa dense_AT{promote_type(ET, Int)} + @test is_densified(dy, dense_AT, promote_type(ET, Int)) @test Array(y) == Array(dy) ## zero-preserving dy = dx .* 1 @@ -240,8 +528,7 @@ function broadcasting_vector(AT, eltypes) end end -function broadcasting_matrix(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) +function broadcasting_matrix(AT, dense_AT, eltypes) for ET in eltypes @testset "SparseMatrix($ET)" begin m, n = 5, 6 @@ -257,7 +544,7 @@ function broadcasting_matrix(AT, eltypes) # not zero-preserving y = x .+ ET(1) dy = dx .+ ET(1) - @test dy isa dense_AT{ET} + @test is_densified(dy, dense_AT, ET) hy = Array(dy) dense_y = Array(y) @test Array(y) == Array(dy) @@ -265,7 +552,7 @@ function broadcasting_matrix(AT, eltypes) # involving something dense y = x .* ones(ET, m, n) dy = dx .* dense_AT(ones(ET, m, n)) - @test dy isa dense_AT{ET} + @test is_densified(dy, dense_AT, ET) @test Array(y) == Array(dy) # multiple inputs @@ -301,7 +588,6 @@ function broadcasting_matrix(AT, eltypes) end function mapreduce_matrix(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) for ET in eltypes @testset "SparseMatrix($ET)" begin m,n = 5,6 @@ -309,32 +595,45 @@ function mapreduce_matrix(AT, eltypes) x = sprand(ET, m, n, p) dx = AT(x) + # The `abs`-reductions below produce a result of element type `RT`; for some + # input eltypes that is a type the back-end can't allocate (e.g. `abs` of a + # `Complex{<:Integer}` is `Float64`, unsupported on Metal). Only exercise them + # when the back-end supports the result type; `sum` keeps the input eltype and + # is always tested. + RT = typeof(abs(zero(ET))) + # dim=: y = sum(x) dy = sum(dx) @test y ≈ dy - y = mapreduce(abs, +, x) - dy = mapreduce(abs, +, dx) - @test y ≈ dy + if RT in eltypes + y = mapreduce(abs, +, x) + dy = mapreduce(abs, +, dx) + @test y ≈ dy + end # dim=1 y = sum(x, dims=1) dy = sum(dx, dims=1) @test y ≈ Array(dy) - y = mapreduce(abs, +, x, dims=1) - dy = mapreduce(abs, +, dx, dims=1) - @test y ≈ Array(dy) + if RT in eltypes + y = mapreduce(abs, +, x, dims=1) + dy = mapreduce(abs, +, dx, dims=1) + @test y ≈ Array(dy) + end # dim=2 y = sum(x, dims=2) dy = sum(dx, dims=2) @test y ≈ Array(dy) - y = mapreduce(abs, +, x, dims=2) - dy = mapreduce(abs, +, dx, dims=2) - @test y ≈ Array(dy) + if RT in eltypes + y = mapreduce(abs, +, x, dims=2) + dy = mapreduce(abs, +, dx, dims=2) + @test y ≈ Array(dy) + end if ET in (Float32, Float64) dy = mapreduce(abs, +, dx; init=zero(ET)) y = mapreduce(abs, +, x; init=zero(ET)) @@ -342,19 +641,20 @@ function mapreduce_matrix(AT, eltypes) end # test with a matrix with fully empty rows - x = zeros(ET, m, n) - x[2, :] .= -one(ET) - x[2, end] = -ET(16) - dx = AT(sparse(x)) - y = mapreduce(abs, max, x) - dy = mapreduce(abs, max, dx) - @test y ≈ dy + if RT in eltypes + x = zeros(ET, m, n) + x[2, :] .= -one(ET) + x[2, end] = -ET(16) + dx = AT(sparse(x)) + y = mapreduce(abs, max, x) + dy = mapreduce(abs, max, dx) + @test y ≈ dy + end end end end function linalg(AT, eltypes) - dense_AT = GPUArrays.dense_array_type(AT) for ET in eltypes # sprandn doesn't work nicely with these... if !(ET <: Union{Int16, Int32, Int64, Complex{Int16}, Complex{Int32}, Complex{Int64}})