diff --git a/src/arithmetics.jl b/src/arithmetics.jl index 73c618a..16f9876 100644 --- a/src/arithmetics.jl +++ b/src/arithmetics.jl @@ -2,7 +2,7 @@ sum( src::AbstractArray, backend::Backend=get_backend(src); init=zero(eltype(src)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -58,7 +58,7 @@ end prod( src::AbstractArray, backend::Backend=get_backend(src); init=one(eltype(src)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -114,7 +114,7 @@ end maximum( src::AbstractArray, backend::Backend=get_backend(src); init=typemin(eltype(src)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -170,7 +170,7 @@ end minimum( src::AbstractArray, backend::Backend=get_backend(src); init=typemax(eltype(src)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -226,7 +226,7 @@ end count( [f=identity], src::AbstractArray, backend::Backend=get_backend(src); init=0, - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks=Threads.nthreads(), diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index 690c292..bb1310b 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -1,5 +1,5 @@ # Generalized N-dimensional mapreduce for GPU and CPU backends, reducing one or more -# dimensions (`dims::Int` or `dims::Tuple`) of `src` into `dst`. +# dimensions (`dims::Integer` or a collection of integers) of `src` into `dst`. # # Design (see references: CUDA.jl / GPUArrays mapreducedim!, PyTorch Reduce.cuh, CUB): # 1. Canonicalize dims: collapse adjacent dimensions with matching strides into contiguous @@ -77,7 +77,7 @@ function mapreduce_nd( f, op, src::MapReduceSource, backend::Backend; init, neutral=neutral_element(op, typeof(init)), - dims::Union{Int, Tuple{Vararg{Int}}}, + dims, # CPU settings max_tasks::Int, @@ -91,14 +91,17 @@ function mapreduce_nd( @argcheck 1 <= block_size <= 1024 @argcheck ispow2(block_size) - dims_all = dims isa Int ? (dims,) : dims - - if Base.any(d < 1 for d in dims_all) - throw(ArgumentError("region dimension(s) must be ≥ 1, got $dims")) + dims_src = dims isa Number ? (dims,) : dims + dims_buf = Int[] + for d in dims_src + d isa Integer || throw(ArgumentError("reduced dimension(s) must be integers")) + dim = Int(d) + dim < 1 && throw(ArgumentError("region dimension(s) must be ≥ 1, got $d")) + push!(dims_buf, dim) end # Match Base: duplicate dims are ignored, e.g. dims=(2,2) behaves like dims=2. - dims_all = Tuple(Base.unique(dims_all)) + dims_all = Tuple(Base.unique(dims_buf)) src_sizes = size(src) ndim = length(src_sizes) diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl index ba29351..5b266bd 100644 --- a/src/reduce/reduce.jl +++ b/src/reduce/reduce.jl @@ -57,7 +57,7 @@ include("mapreduce_nd.jl") op, src::AbstractArray, backend::Backend=get_backend(src); init, neutral=neutral_element(op, typeof(init)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks::Int=Threads.nthreads(), @@ -70,16 +70,16 @@ include("mapreduce_nd.jl") ) Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing` or -`:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of integers, reduce `src` along -those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` is -the neutral element for the operator `op`. +`:`, reduce `src` to a scalar. If `dims` is an integer or a collection of integers, reduce `src` +along those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` +is the neutral element for the operator `op`. The returned type is the same as `init` - to control output precision, specify `init` explicitly. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional -arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for -`max_tasks >= 4`; all other cases are scaling linearly with the number of threads. +arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes +faster for `max_tasks >= 4`; all other cases are scaling linearly with the number of threads. Note that multithreading reductions only improves performance for cases with more compute-heavy operations, which hide the memory latency and thread launch overhead - that includes: @@ -93,8 +93,8 @@ The `block_size` parameter controls the number of threads per block and must be The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar (`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * -block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is -used as the destination array, and thus must have the exact dimensions required - i.e. same +block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of +integers), `temp` is used as the destination array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact behavior. @@ -142,7 +142,7 @@ end f, op, src::AbstractArray, backend::Backend=get_backend(src); init, neutral=neutral_element(op, typeof(init)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, + dims=nothing, # CPU settings max_tasks::Int=Threads.nthreads(), @@ -158,7 +158,7 @@ end mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray..., backend::Backend; init, kwargs...) Reduce `src` along dimensions `dims` using the binary operator `op` after applying `f` elementwise. -If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of +If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a collection of integers, reduce `src` along those dimension(s). The `init` value is used as the initial value for the reduction (i.e. after mapping). @@ -175,16 +175,16 @@ are reduced without materializing the intermediate array. Mismatched axes throw ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional -arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for -`max_tasks >= 4`; all other cases are scaling linearly with the number of threads. +arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes +faster for `max_tasks >= 4`; all other cases are scaling linearly with the number of threads. ## GPU settings The `block_size` parameter controls the number of threads per block and must be a power of two. The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar (`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * -block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is -used as the destination array, and thus must have the exact dimensions required - i.e. same +block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of +integers), `temp` is used as the destination array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact behavior. @@ -281,7 +281,7 @@ function _mapreduce_impl( f, op, src::MapReduceSource, backend::Backend; init, neutral=neutral_element(op, typeof(init)), - dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon} = nothing, + dims = nothing, # CPU settings max_tasks::Int=Threads.nthreads(), diff --git a/test/generic/reduce.jl b/test/generic/reduce.jl index 3f0587e..55b7e0a 100644 --- a/test/generic/reduce.jl +++ b/test/generic/reduce.jl @@ -259,6 +259,16 @@ end end end + # Base also accepts iterable dims such as vectors and ranges. + for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2) + vh = rand(Int32(1):Int32(100), 3, 4, 5) + v = array_from_host(vh) + @test Array(AK.reduce(+, v; prefer_threads, init=Int32(0), dims)) == + sum(vh; init=Int32(0), dims) + end + + @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0]) + # Tiled strided GPU path: contiguous kept dimensions, one strided reduce # dimension, and dst_size == reduce_size. The 3D case also exercises a # partial output tile. @@ -650,6 +660,16 @@ end end end + # Base also accepts iterable dims such as vectors and ranges. + for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2) + vh = rand(Int32(1):Int32(100), 3, 4, 5) + v = array_from_host(vh) + @test Array(AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims)) == + mapreduce(-, +, vh; init=Int32(0), dims) + end + + @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0]) + # Tiled strided GPU path coverage for mapreduce, including a 3D case with # a partial output tile. for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3))