Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/arithmetics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
sum(
src::AbstractArray, backend::Backend=get_backend(src);
init=zero(eltype(src)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks=Threads.nthreads(),
Expand Down Expand Up @@ -58,7 +58,7 @@ end
prod(
src::AbstractArray, backend::Backend=get_backend(src);
init=one(eltype(src)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks=Threads.nthreads(),
Expand Down Expand Up @@ -114,7 +114,7 @@ end
maximum(
src::AbstractArray, backend::Backend=get_backend(src);
init=typemin(eltype(src)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks=Threads.nthreads(),
Expand Down Expand Up @@ -170,7 +170,7 @@ end
minimum(
src::AbstractArray, backend::Backend=get_backend(src);
init=typemax(eltype(src)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks=Threads.nthreads(),
Expand Down Expand Up @@ -226,7 +226,7 @@ end
count(
[f=identity], src::AbstractArray, backend::Backend=get_backend(src);
init=0,
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks=Threads.nthreads(),
Expand Down
17 changes: 10 additions & 7 deletions src/reduce/mapreduce_nd.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Generalized N-dimensional mapreduce for GPU and CPU backends, reducing one or more
# dimensions (`dims::Int` or `dims::Tuple`) of `src` into `dst`.
# dimensions (`dims::Integer` or a collection of integers) of `src` into `dst`.
#
# Design (see references: CUDA.jl / GPUArrays mapreducedim!, PyTorch Reduce.cuh, CUB):
# 1. Canonicalize dims: collapse adjacent dimensions with matching strides into contiguous
Expand Down Expand Up @@ -77,7 +77,7 @@ function mapreduce_nd(
f, op, src::MapReduceSource, backend::Backend;
init,
neutral=neutral_element(op, typeof(init)),
dims::Union{Int, Tuple{Vararg{Int}}},
dims,

# CPU settings
max_tasks::Int,
Expand All @@ -91,14 +91,17 @@ function mapreduce_nd(
@argcheck 1 <= block_size <= 1024
@argcheck ispow2(block_size)

dims_all = dims isa Int ? (dims,) : dims

if Base.any(d < 1 for d in dims_all)
throw(ArgumentError("region dimension(s) must be ≥ 1, got $dims"))
dims_src = dims isa Number ? (dims,) : dims
dims_buf = Int[]
for d in dims_src
d isa Integer || throw(ArgumentError("reduced dimension(s) must be integers"))
dim = Int(d)
dim < 1 && throw(ArgumentError("region dimension(s) must be ≥ 1, got $d"))
push!(dims_buf, dim)
end

# Match Base: duplicate dims are ignored, e.g. dims=(2,2) behaves like dims=2.
dims_all = Tuple(Base.unique(dims_all))
dims_all = Tuple(Base.unique(dims_buf))

src_sizes = size(src)
ndim = length(src_sizes)
Expand Down
30 changes: 15 additions & 15 deletions src/reduce/reduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ include("mapreduce_nd.jl")
op, src::AbstractArray, backend::Backend=get_backend(src);
init,
neutral=neutral_element(op, typeof(init)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks::Int=Threads.nthreads(),
Expand All @@ -70,16 +70,16 @@ include("mapreduce_nd.jl")
)

Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing` or
`:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of integers, reduce `src` along
those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` is
the neutral element for the operator `op`.
`:`, reduce `src` to a scalar. If `dims` is an integer or a collection of integers, reduce `src`
along those dimension(s). The `init` value is used as the initial value for the reduction; `neutral`
is the neutral element for the operator `op`.

The returned type is the same as `init` - to control output precision, specify `init` explicitly.

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes
faster for `max_tasks >= 4`; all other cases scale linearly with the number of threads.

Note that multithreaded reductions only improve performance for cases with more compute-heavy
operations, which hide the memory latency and thread launch overhead - that includes:
Expand All @@ -93,8 +93,8 @@ The `block_size` parameter controls the number of threads per block and must be

The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
used as the destination array, and thus must have the exact dimensions required - i.e. same
block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of
integers), `temp` is used as the destination array, and thus must have the exact dimensions required - i.e. same
dimensionwise sizes as `src`, except for the reduced dimension(s), which become 1. There are some
corner cases when one dimension is zero; check against `Base.reduce` for CPU arrays for exact
behavior.
Expand Down Expand Up @@ -142,7 +142,7 @@ end
f, op, src::AbstractArray, backend::Backend=get_backend(src);
init,
neutral=neutral_element(op, typeof(init)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
dims=nothing,

# CPU settings
max_tasks::Int=Threads.nthreads(),
Expand All @@ -158,7 +158,7 @@ end
mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray..., backend::Backend; init, kwargs...)

Reduce `src` along dimensions `dims` using the binary operator `op` after applying `f` elementwise.
If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of
If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a collection of
integers, reduce `src` along those dimension(s). The `init` value is used as the initial value for
the reduction (i.e. after mapping).

Expand All @@ -175,16 +175,16 @@ are reduced without materializing the intermediate array. Mismatched axes throw

## CPU settings
Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes
faster for `max_tasks >= 4`; all other cases scale linearly with the number of threads.

## GPU settings
The `block_size` parameter controls the number of threads per block and must be a power of two.

The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
used as the destination array, and thus must have the exact dimensions required - i.e. same
block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of
integers), `temp` is used as the destination array, and thus must have the exact dimensions required - i.e. same
dimensionwise sizes as `src`, except for the reduced dimension(s), which become 1. There are some
corner cases when one dimension is zero; check against `Base.reduce` for CPU arrays for exact
behavior.
Expand Down Expand Up @@ -281,7 +281,7 @@ function _mapreduce_impl(
f, op, src::MapReduceSource, backend::Backend;
init,
neutral=neutral_element(op, typeof(init)),
dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon} = nothing,
dims = nothing,

# CPU settings
max_tasks::Int=Threads.nthreads(),
Expand Down
20 changes: 20 additions & 0 deletions test/generic/reduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,16 @@ end
end
end

# Base also accepts iterable dims such as vectors and ranges.
for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2)
vh = rand(Int32(1):Int32(100), 3, 4, 5)
v = array_from_host(vh)
@test Array(AK.reduce(+, v; prefer_threads, init=Int32(0), dims)) ==
sum(vh; init=Int32(0), dims)
end

@test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0])

# Tiled strided GPU path: contiguous kept dimensions, one strided reduce
# dimension, and dst_size == reduce_size. The 3D case also exercises a
# partial output tile.
Expand Down Expand Up @@ -650,6 +660,16 @@ end
end
end

# Base also accepts iterable dims such as vectors and ranges.
for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2)
vh = rand(Int32(1):Int32(100), 3, 4, 5)
v = array_from_host(vh)
@test Array(AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims)) ==
mapreduce(-, +, vh; init=Int32(0), dims)
end

@test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0])

# Tiled strided GPU path coverage for mapreduce, including a 3D case with
# a partial output tile.
for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3))
Expand Down
Loading