diff --git a/src/arithmetics.jl b/src/arithmetics.jl index a14a9e27..73c618a0 100644 --- a/src/arithmetics.jl +++ b/src/arithmetics.jl @@ -2,7 +2,7 @@ sum( src::AbstractArray, backend::Backend=get_backend(src); init=zero(eltype(src)), - dims::Union{Nothing, Int}=nothing, + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -33,11 +33,11 @@ m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000)) s = AK.sum(m, dims=1) ``` -If you know the shape of the resulting array (in case of a axis-wise sum, i.e. `dims` is not +If you know the shape of the resulting array (in case of a dimensionwise sum, i.e. `dims` is not `nothing`), you can provide the `temp` argument to save results into and avoid allocations: ```julia m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000)) -temp = MtlArray(zeros(Int32, 10)) +temp = MtlArray(zeros(Int32, 10, 1)) s = AK.sum(m, dims=2, temp=temp) ``` """ @@ -58,7 +58,7 @@ end prod( src::AbstractArray, backend::Backend=get_backend(src); init=one(eltype(src)), - dims::Union{Nothing, Int}=nothing, + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -89,11 +89,11 @@ m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000)) p = AK.prod(m, dims=1) ``` -If you know the shape of the resulting array (in case of a axis-wise product, i.e. `dims` is not +If you know the shape of the resulting array (in case of a dimensionwise product, i.e. `dims` is not `nothing`), you can provide the `temp` argument to save results into and avoid allocations: ```julia m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000)) -temp = ROCArray(ones(Int32, 10)) +temp = ROCArray(ones(Int32, 10, 1)) p = AK.prod(m, dims=2, temp=temp) ``` """ @@ -114,7 +114,7 @@ end maximum( src::AbstractArray, backend::Backend=get_backend(src); init=typemin(eltype(src)), - dims::Union{Nothing, Int}=nothing, + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -145,11 +145,11 @@ m = oneArray(rand(Int32(1):Int32(100), 10, 100_000)) m = AK.maximum(m, dims=1) ``` -If you know the shape of the resulting array (in case of a axis-wise maximum, i.e. `dims` is not +If you know the shape of the resulting array (in case of a dimensionwise maximum, i.e. `dims` is not `nothing`), you can provide the `temp` argument to save results into and avoid allocations: ```julia m = oneArray(rand(Int32(1):Int32(100), 10, 100_000)) -temp = oneArray(zeros(Int32, 10)) +temp = oneArray(zeros(Int32, 10, 1)) m = AK.maximum(m, dims=2, temp=temp) ``` """ @@ -170,7 +170,7 @@ end minimum( src::AbstractArray, backend::Backend=get_backend(src); init=typemax(eltype(src)), - dims::Union{Nothing, Int}=nothing, + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -201,11 +201,11 @@ m = CuArray(rand(Int32(1):Int32(100), 10, 100_000)) m = AK.minimum(m, dims=1) ``` -If you know the shape of the resulting array (in case of a axis-wise minimum, i.e. `dims` is not +If you know the shape of the resulting array (in case of a dimensionwise minimum, i.e. `dims` is not `nothing`), you can provide the `temp` argument to save results into and avoid allocations: ```julia m = CuArray(rand(Int32(1):Int32(100), 10, 100_000)) -temp = CuArray(ones(Int32, 10)) +temp = CuArray(ones(Int32, 10, 1)) m = AK.minimum(m, dims=2, temp=temp) ``` """ @@ -226,7 +226,7 @@ end count( [f=identity], src::AbstractArray, backend::Backend=get_backend(src); init=0, - dims::Union{Nothing, Int}=nothing, + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks=Threads.nthreads(), @@ -263,12 +263,12 @@ m = MtlArray(rand(Bool, 10, 100_000)) c = AK.count(m, dims=1) ``` -If you know the shape of the resulting array (in case of a axis-wise count, i.e. `dims` is not +If you know the shape of the resulting array (in case of a dimensionwise count, i.e. `dims` is not `nothing`), you can provide the `temp` argument to save results into and avoid allocations: ```julia m = MtlArray(rand(Bool, 10, 100_000)) -temp = MtlArray(zeros(Int32, 10)) -c = AK.count(m, dims=2, temp=temp) +temp = MtlArray(zeros(Int32, 10, 1)) +c = AK.count(m; init=Int32(0), dims=2, temp=temp) ``` """ function count( diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl index 95a93f21..3a47b603 100644 --- a/src/reduce/mapreduce_1d_cpu.jl +++ b/src/reduce/mapreduce_1d_cpu.jl @@ -1,5 +1,5 @@ function mapreduce_1d_cpu( - f, op, src::AbstractArray, backend::Backend; + f, op, src::MapReduceSource, backend::Backend; init, neutral, @@ -12,6 +12,10 @@ function mapreduce_1d_cpu( temp::Union{Nothing, AbstractArray}, switch_below::Int, ) + if src isa Base.Broadcast.Broadcasted + return op(init, Base.mapreduce(f, op, src; init=neutral)) + end + if max_tasks == 1 return op(init, Base.mapreduce(f, op, src; init=neutral)) end diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl index 39e7c415..f016fa41 100644 --- a/src/reduce/mapreduce_1d_gpu.jl +++ b/src/reduce/mapreduce_1d_gpu.jl @@ -47,7 +47,7 @@ end function mapreduce_1d_gpu( - f, op, src::AbstractArray, backend::Backend; + f, op, src::MapReduceSource, backend::Backend; init, neutral, @@ -61,12 +61,13 @@ function mapreduce_1d_gpu( switch_below::Int, ) @argcheck 1 <= block_size <= 1024 + @argcheck ispow2(block_size) @argcheck switch_below >= 0 # Degenerate cases len = length(src) len == 0 && return init - len == 1 && return @allowscalar f(src[1]) + len == 1 && return op(init, @allowscalar f(src[1])) if len < switch_below h_src = Vector(src) return Base.mapreduce(f, op, h_src; init) @@ -87,8 +88,8 @@ function mapreduce_1d_gpu( dst = KernelAbstractions.allocate(backend, dst_type, blocks * 2) end - # Later the kernel will be compiled for views anyways, so use same types - src_view = @view src[1:end] + # Later the kernel will be compiled for views anyways, so use same types for arrays. + src_view = _mapreduce_1d_src_view(src) dst_view = @view dst[1:blocks] kernel! = _mapreduce_block!(backend, block_size) @@ -125,3 +126,6 @@ function mapreduce_1d_gpu( # The GPU kernel reduced all elements to one, but without the init value return op(init, @allowscalar(p1[1])) end + +_mapreduce_1d_src_view(src::AbstractArray) = @view src[1:end] +_mapreduce_1d_src_view(src::Base.Broadcast.Broadcasted) = src diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index 231d0dc4..690c2920 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -1,10 +1,85 @@ +# Generalized N-dimensional mapreduce for GPU and CPU backends, reducing one or more +# dimensions (`dims::Int` or `dims::Tuple`) of `src` into `dst`. +# +# Design (see references: CUDA.jl / GPUArrays mapreducedim!, PyTorch Reduce.cuh, CUB): +# 1. Canonicalize dims: collapse adjacent dimensions with matching strides into contiguous +# segments. A plain `dims=2` reduction, or any reduction over a contiguous block of dims +# (e.g. `dims=(1,2)`), collapses to a *single* reduce segment. +# 2. The inner reduce loop must avoid per-element integer division. For a single segment the +# element offset is just `j * stride`; only genuinely non-contiguous dim sets (e.g. +# `dims=(1,3)`) fall back to a per-element multi-dimensional decode. +# 3. Four work decompositions, chosen by the relative sizes and strides of the output and +# the reduction: +# - by_thread: one thread per output (many outputs, small reduction) +# - tiled_strided: several contiguous outputs per block, for square-like strided reductions +# - by_block: one block per output, grid-stride over outputs (few outputs, large reduction) +# - multigroup: several blocks per output, two-pass (dst_size==1 or very small dst_size) + +# Number of blocks the by_block / multigroup paths aim to launch, so a reduction with +# few output elements can still fill the GPU. A heuristic GPU-occupancy target. +const TARGET_BLOCKS = 256 + +# Below this many output elements, splitting a single output's reduction across multiple +# blocks (multigroup) is preferred over grid-striding by_block, because grid-stride with +# too few blocks cannot fill the GPU on its own. At or above this, by_block grid-strides. +const GS_DST_CUTOFF = 32 + +# Minimum reduction-loop iterations per thread in the multigroup first pass. +# Caps reduce_groups so splitting a reduction doesn't shrink per-thread work +# below the point where launch/scheduling overhead dominates actual work. +const MIN_ITEMS_PER_THREAD = 8 + +# Number of contiguous output rows handled by each tiled-strided block. With the default +# 256-thread block this gives 32 lanes per row. +const TILED_STRIDED_ROWS_PER_BLOCK = 8 + + +# Host-side canonicalization: split the dimensions into reduced and kept ("outer") segments, +# merging adjacent dimensions whose strides are contiguous. Returns two tuples of +# (stride, size) segments. +function _canonicalize_dims(src_sizes, src_strides, dims_valid) + ndim = length(src_sizes) + reduce_segs = Tuple{Int,Int}[] + outer_segs = Tuple{Int,Int}[] + + i = 1 + while i <= ndim + if i in dims_valid + seg_stride = src_strides[i] + seg_size = src_sizes[i] + while i + 1 <= ndim && + (i + 1) in dims_valid && + src_strides[i + 1] == seg_stride * seg_size + i += 1 + seg_size *= src_sizes[i] + end + push!(reduce_segs, (seg_stride, seg_size)) + else + seg_stride = src_strides[i] + seg_size = src_sizes[i] + while i + 1 <= ndim && + !((i + 1) in dims_valid) && + src_strides[i + 1] == seg_stride * seg_size + i += 1 + seg_size *= src_sizes[i] + end + push!(outer_segs, (seg_stride, seg_size)) + end + i += 1 + end + + return Tuple(reduce_segs), Tuple(outer_segs) +end + +# Main entry point + function mapreduce_nd( - f, op, src::AbstractArray, backend::Backend; + f, op, src::MapReduceSource, backend::Backend; init, - neutral=neutral_element(op, eltype(src)), - dims::Int, + neutral=neutral_element(op, typeof(init)), + dims::Union{Int, Tuple{Vararg{Int}}}, - # CPU settings - ignored here + # CPU settings max_tasks::Int, min_elems::Int, prefer_threads::Bool=true, @@ -14,327 +89,637 @@ function mapreduce_nd( temp::Union{Nothing, AbstractArray}, ) @argcheck 1 <= block_size <= 1024 + @argcheck ispow2(block_size) - # Degenerate cases begin; order of priority matters + dims_all = dims isa Int ? (dims,) : dims - # Invalid dims - if dims < 1 + if Base.any(d < 1 for d in dims_all) throw(ArgumentError("region dimension(s) must be ≥ 1, got $dims")) end - # If dims > number of dimensions, just map each element through f and add init, e.g.: - # julia> x = rand(Float64, 3, 5); - # julia> mapreduce(x -> -x, +, x, dims=3, init=Float32(0)) - # 3×5 Matrix{Float32} # Negative numbers + # Match Base: duplicate dims are ignored, e.g. dims=(2,2) behaves like dims=2. + dims_all = Tuple(Base.unique(dims_all)) + src_sizes = size(src) - if dims > length(src_sizes) - if isnothing(temp) - dst = KernelAbstractions.allocate(backend, typeof(init), src_sizes) - else - @argcheck get_backend(temp) == backend - @argcheck size(temp) == src_sizes - @argcheck eltype(temp) == typeof(init) - dst = temp - end + ndim = length(src_sizes) + + dims_valid = Tuple(d for d in dims_all if d <= ndim) + + # Degenerate cases begin; order of priority matters + + # All reduced dims are beyond ndims: just map each element through f and add init, e.g.: + # julia> x = rand(Float64, 3, 5); + # julia> mapreduce(x -> -x, +, x, dims=3, init=Float32(0)) # 3×5 Matrix{Float32} + if isempty(dims_valid) + dst = _alloc_or_temp(backend, temp, init, src_sizes) _mapreduce_nd_apply_init!(f, op, dst, src, backend; init, max_tasks, min_elems, block_size) return dst end # The per-dimension sizes of the destination array; construct tuple without allocations dst_sizes = unrolled_map_index(src_sizes) do i - i == dims ? 1 : src_sizes[i] + i in dims_valid ? 1 : src_sizes[i] end - # If any dimension except dims is zero, return empty similar array except with the dims - # dimension = 1. Weird, see example below: - # julia> x = rand(3, 0, 5); - # julia> reduce(+, x, dims=3) - # 3×0×1 Array{Float64, 3} + # If any kept dimension is zero, return empty array (reduced dims become 1), e.g.: + # julia> x = rand(3, 0, 5); reduce(+, x, dims=3) # 3×0×1 Array{Float64, 3} for isize in eachindex(src_sizes) - isize == dims && continue + isize in dims_valid && continue if src_sizes[isize] == 0 - if isnothing(temp) - dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes) - else - @argcheck size(temp) == dst_sizes - @argcheck eltype(temp) == typeof(init) - dst = temp - end - return dst + return _alloc_or_temp(backend, temp, init, dst_sizes) end end - # If sizes[dims] == 0, return array filled with init; same shape except sizes[dims] = 1: - # julia> x = rand(3, 0, 5); - # julia> mapreduce(+, x, dims=2) - # 3×1×5 Array{Float64, 3}: - # [:, :, 1] = - # 0.0 - # 0.0 - # 0.0 - # [...] - len = src_sizes[dims] + len = Base.prod(src_sizes[d] for d in dims_valid) + + # If a reduced dimension is zero, return array filled with init, e.g.: + # julia> x = rand(3, 0, 5); mapreduce(+, x, dims=2) # 3×1×5 of zeros if len == 0 - if isnothing(temp) - dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes) - else - @argcheck get_backend(temp) == backend - @argcheck size(temp) == dst_sizes - @argcheck eltype(temp) == typeof(init) - dst = temp - end + dst = _alloc_or_temp(backend, temp, init, dst_sizes) fill!(dst, init) return dst end - # If sizes[dims] == 1, just map each element through f. Again, keep same type as init + # If the reduced extent is 1, just map each element through f (keep init's type) if len == 1 - if isnothing(temp) - dst = KernelAbstractions.allocate(backend, typeof(init), src_sizes) - else - @argcheck get_backend(temp) == backend - @argcheck size(temp) == src_sizes - @argcheck eltype(temp) == typeof(init) - dst = temp - end + dst = _alloc_or_temp(backend, temp, init, src_sizes) _mapreduce_nd_apply_init!(f, op, dst, src, backend; init, max_tasks, min_elems, block_size) return dst end # Degenerate cases end - # Allocate destination array - if isnothing(temp) - dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes) - else - @argcheck get_backend(temp) == backend - @argcheck size(temp) == dst_sizes - @argcheck eltype(temp) == typeof(init) - dst = temp - end + dst = _alloc_or_temp(backend, temp, init, dst_sizes) dst_size = length(dst) - if !use_gpu_algorithm(backend, prefer_threads) - _mapreduce_nd_cpu_sections!( - f, op, dst, src; - init, - dims, - max_tasks=max_tasks, - min_elems=min_elems, + if backend == CPU_BACKEND + _mapreduce_nd_cpu_sections!(f, op, dst, src; init, max_tasks, min_elems) + return dst + end + + # The stride-based fast paths below index a flat buffer at + # `buffer[base_offset + Σ coordᵈ·strideᵈ + 1]`. This works for any source backed + # by a single dense column-major buffer (dense arrays, but also strided views, + # adjoints, permuted dims, and reshapes over one). Sources without such a buffer + # — `Broadcasted`, lazy/computed arrays — take the generic Cartesian-indexed + # fallback, which makes no layout assumption (and crucially does not wrap the + # source in `@Const`, which is what makes e.g. PermutedDimsArray uncompilable). + layout = _mapreduce_strided_layout(src) + if isnothing(layout) + blocks = cld(dst_size, block_size) + kernel! = _mapreduce_nd_generic!(backend, block_size) + kernel!( + src, dst, f, op, init, + CartesianIndices(dst), _mapreduce_reduce_indices(src, dst), dst_size, + ndrange=(block_size * blocks,), ) - else - # On GPUs we have two parallelisation approaches, based on which dimension has more elements: - # - If the dimension we are reducing has more elements, (e.g. reduce(+, rand(3, 1000), dims=2)), - # we use a block of threads per dst element - thus, a block of threads reduces the dims axis - # - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we - # use a single thread per dst element - thus, a thread reduces the dims axis sequentially, - # while the other dimensions are processed in parallel, independently - if dst_size >= src_sizes[dims] - blocks = (dst_size + block_size - 1) ÷ block_size - kernel1! = _mapreduce_nd_by_thread!(backend, block_size) - kernel1!( - src, dst, f, op, init, dims, - ndrange=(block_size * blocks,), + return dst + end + + buffer, base_offset, src_strides = layout + reduce_segs, outer_segs = _canonicalize_dims(src_sizes, src_strides, dims_valid) + + outer_strides = Tuple(str for (str, _) in outer_segs) + outer_sizes = Tuple(s for (_, s) in outer_segs) + reduce_strides = Tuple(str for (str, _) in reduce_segs) + reduce_sizes = Tuple(s for (_, s) in reduce_segs) + reduce_size = len + + # ───────────────────────────────────────────────────────────────────────── + # Dispatch decision (see header comment for the four paths): + # + # - square-like, contiguous output with strided input -> tiled_strided + # - dst_size >= reduce_size -> by_thread + # - dst_size == 1, or dst_size < GS_DST_CUTOFF + # (and dst_size < reduce_size) -> multigroup (split one + # reduction across blocks, + # needs a 2nd-pass combine) + # - otherwise (GS_DST_CUTOFF <= dst_size < reduce_size) -> by_block, grid-striding + # over outputs, single pass + # + # Rationale: grid-striding by_block launches `min(dst_size, TARGET_BLOCKS)` blocks; + # for very small dst_size (e.g. 5 or 9) that under-fills an 84-SM GPU, so splitting + # the (large) reduction itself across many blocks via multigroup is still better. + # For dst_size==1 there is nothing to grid-stride over, so multigroup is the only + # option regardless of GS_DST_CUTOFF. + # ───────────────────────────────────────────────────────────────────────── + + # by_block override 1: when the reduced dimension is the fastest-varying one + # (reduce_strides==(1,)), by_thread's per-thread strided access is badly + # uncoalesced, while by_block lets consecutive threads read consecutive elements. + use_by_block_for_coalescing = + dst_size >= reduce_size && reduce_size >= block_size && + length(reduce_sizes) == 1 && reduce_sizes[1] != 0 && + reduce_strides == (1,) + + # by_block override 2: when by_thread would launch too few blocks for a square-like + # output/reduction shape, split each output reduction across a full block. This + # fallback is mostly for layouts that do not satisfy the narrower tiled-strided + # pattern below; the common contiguous-output strided case uses tiled_strided. + # Avoid applying this to wide-output shapes, where by_thread's cross-output + # coalescing is better than a strided block reduction. + use_by_block_for_low_occupancy = + dst_size == reduce_size && reduce_size >= block_size && + cld(dst_size, block_size) < TARGET_BLOCKS && + length(reduce_sizes) == 1 && reduce_sizes[1] != 0 + + use_by_block = use_by_block_for_coalescing || use_by_block_for_low_occupancy + use_tiled_strided = + dst_size == reduce_size && reduce_size >= block_size && + length(outer_sizes) == 1 && outer_strides == (1,) && + length(reduce_sizes) == 1 && reduce_strides[1] > 1 && + block_size % TILED_STRIDED_ROWS_PER_BLOCK == 0 && + ispow2(block_size ÷ TILED_STRIDED_ROWS_PER_BLOCK) + + if use_tiled_strided + # Narrow layout-specific path: square-like row reductions with contiguous + # outputs and strided input reads, e.g. size=(1024,1024), dims=2. Several + # outputs share a block: small lane groups reduce one output each, preserving + # more cross-output memory coalescing than by_block while launching many more + # blocks than by_thread. This is the only extra kernel beyond the generic + # by_thread/by_block/multigroup shapes. + rows_per_block = TILED_STRIDED_ROWS_PER_BLOCK + blocks = cld(dst_size, rows_per_block) + kernel! = _mapreduce_nd_by_thread_tiled_strided!(backend, block_size) + kernel!( + buffer, dst, f, op, init, neutral, + base_offset, reduce_strides[1], dst_size, reduce_size, Val(rows_per_block), + ndrange=(block_size * blocks,), + ) + elseif dst_size >= reduce_size && !use_by_block + # Many outputs, small reduction: one thread per output reduces sequentially + blocks = cld(dst_size, block_size) + kernel! = _mapreduce_nd_by_thread!(backend, block_size) + kernel!( + buffer, dst, f, op, init, + base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes, + dst_size, reduce_size, + ndrange=(block_size * blocks,), + ) + elseif use_by_block + # Grid-stride by_block: one full block cooperates on each output, then + # grid-strides across remaining outputs if fewer blocks than outputs were + # launched. + launch_blocks = min(dst_size, TARGET_BLOCKS) + kernel! = _mapreduce_nd_by_block!(backend, block_size) + kernel!( + buffer, dst, f, op, init, neutral, + base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes, + dst_size, reduce_size, launch_blocks, + ndrange=(block_size * launch_blocks,), + ) + elseif dst_size == 1 || dst_size < GS_DST_CUTOFF + # Very few outputs, large reduction: split each output's reduction across + # `reduce_groups` blocks (multigroup), combine partials in a second pass. + reduce_groups = min( + cld(reduce_size, block_size), + block_size, + cld(TARGET_BLOCKS, dst_size), + cld(reduce_size, block_size * MIN_ITEMS_PER_THREAD), + ) + reduce_groups = max(reduce_groups, 1) + + if reduce_groups > 1 + partial = KernelAbstractions.allocate(backend, typeof(init), (dst_size, reduce_groups)) + + kernel! = _mapreduce_nd_multigroup!(backend, block_size) + kernel!( + buffer, partial, f, op, neutral, + base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes, + Val(dst_size), reduce_size, reduce_groups, + ndrange=(block_size * dst_size * reduce_groups,), ) - else - # One block per output element - blocks = dst_size - kernel2! = _mapreduce_nd_by_block!(backend, block_size) + + # Second pass: reduce partial (dst_size × reduce_groups) → dst, one block + # per output. reduce_groups is small (<=block_size by construction), so use + # a small block size for this pass to avoid an oversubscribed tree-reduce. + pass2_block_size = _pass2_block_size(reduce_groups) + kernel2! = _mapreduce_partial_to_dst!(backend, pass2_block_size) kernel2!( - src, dst, f, op, init, neutral, dims, - ndrange=(block_size * blocks,), + partial, dst, op, init, neutral, + dst_size, reduce_groups, + ndrange=(pass2_block_size * dst_size,), + ) + else + # reduce_groups collapsed to 1 (e.g. reduce_size <= block_size): just do a + # single-block-per-output reduction directly, no partial array needed. + kernel! = _mapreduce_nd_by_block!(backend, block_size) + kernel!( + buffer, dst, f, op, init, neutral, + base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes, + dst_size, reduce_size, dst_size, + ndrange=(block_size * dst_size,), ) end + else + # GS_DST_CUTOFF <= dst_size < reduce_size: grid-stride over outputs, one pass. + # Cap launched blocks at TARGET_BLOCKS; each block handles + # ceil(dst_size / launch_blocks) outputs sequentially. + launch_blocks = min(dst_size, TARGET_BLOCKS) + kernel! = _mapreduce_nd_by_block!(backend, block_size) + kernel!( + buffer, dst, f, op, init, neutral, + base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes, + dst_size, reduce_size, launch_blocks, + ndrange=(block_size * launch_blocks,), + ) end return dst end +function _mapreduce_dense_strides(sizes::Tuple) + stride = 1 + return ntuple(length(sizes)) do i + s = stride + stride *= sizes[i] + s + end +end + +# Resolve the layout the stride-based fast-path kernels need. Those kernels index a +# flat buffer at `buffer[base_offset + Σ coordᵈ·strideᵈ + 1]`, so they work for any +# source backed by a single dense column-major buffer — a dense array, but also a +# strided view, adjoint, permuted-dims, or reshape over one. Returns +# `(buffer, base_offset, strides)` for such sources, or `nothing` (→ generic +# Cartesian-indexed fallback) for `Broadcasted` and anything not backed by a dense +# buffer (lazy/computed arrays, complex adjoints without `strides`, nested wrappers). +function _mapreduce_strided_layout(src::AbstractArray) + s = try + strides(src) + catch err + err isa MethodError || rethrow() + return nothing + end + + # Dense or contiguous (column-major) — index the source directly, no offset. + s == _mapreduce_dense_strides(size(src)) && return (src, 0, s) + + # Non-dense but strided: index the dense parent buffer at the source's offset. + # Only one level of wrapping over a dense buffer is supported; deeper nesting + # falls back to the generic kernel. + p = parent(src) + (p === src || !_mapreduce_is_dense_buffer(p)) && return nothing + base = _mapreduce_wrapper_offset(src, p) + base === nothing && return nothing + return (p, base, s) +end + +_mapreduce_strided_layout(::Base.Broadcast.Broadcasted) = nothing + +function _mapreduce_is_dense_buffer(p::AbstractArray) + try + return strides(p) == _mapreduce_dense_strides(size(p)) + catch err + err isa MethodError || rethrow() + return false + end +end + +# Offset (0-based, in elements) of the wrapper's first element within its dense +# parent buffer. SubArrays start at their first selected index; permuted-dims, +# reshapes, and (real) adjoints/transposes share the parent's first element. +function _mapreduce_wrapper_offset(src::SubArray, p) + try + # Base.map (not AK's array `map`, which shadows it inside this module). + first_index = Base.map(first, parentindices(src)) + return LinearIndices(p)[first_index...] - 1 + catch + return nothing # exotic index types → fall back to the generic kernel + end +end +_mapreduce_wrapper_offset(src, p) = 0 + +# The reduced-extent index space: full axes along reduced dimensions, a single +# index (`OneTo(1)`) along kept dimensions. Combined with `max(Iother, Ireduce)` +# this reproduces Base's `mapreducedim!` iteration without touching strides. +_mapreduce_reduce_indices(src, dst) = + CartesianIndices(ifelse.(axes(src) .== axes(dst), Ref(Base.OneTo(1)), axes(src))) + + +# Smallest power-of-two >= n, capped at 256 (the original fixed block size) and +# floored at 32 (one warp) — used for the multigroup second pass, which only ever +# combines `reduce_groups` values (reduce_groups <= block_size by construction). +@inline function _pass2_block_size(n::Int) + n <= 32 && return 32 + n <= 64 && return 64 + n <= 128 && return 128 + return 256 +end + + +# Allocate a destination array, or validate and reuse a user-provided `temp`. +function _alloc_or_temp(backend, temp, init, sizes) + isnothing(temp) && return KernelAbstractions.allocate(backend, typeof(init), sizes) + @argcheck get_backend(temp) == backend + @argcheck size(temp) == sizes + @argcheck eltype(temp) == typeof(init) + temp +end + +# CPU path function _mapreduce_nd_cpu_sections!( f, op, dst, src; - init, - dims, - max_tasks, min_elems, + init, max_tasks, min_elems, ) - src_sizes = size(src) - src_strides = strides(src) - dst_strides = strides(dst) - - reduce_size = src_sizes[dims] - ndims = length(src_sizes) + Rother = CartesianIndices(dst) + Rreduce = _mapreduce_reduce_indices(src, dst) - # Each thread handles a section of the output array - i.e. reducing along the dims, for - # multiple output strides foreachindex(dst, max_tasks=max_tasks, min_elems=min_elems) do idst - @inbounds begin - # Compute the base index in src (excluding the reduced axis) - input_base_idx = 0 - tmp = idst - 1 - KernelAbstractions.Extras.@unroll for i in ndims:-1:1 - if i != dims - input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i] - end - tmp = tmp % dst_strides[i] - end - - # Go over each element in the reduced dimension + Iother = Rother[idst] res = init - for i in 0:reduce_size - 1 - src_idx = input_base_idx + i * src_strides[dims] - res = op(res, f(src[src_idx + 1])) + for Ireduce in Rreduce + J = max(Iother, Ireduce) + res = op(res, f(src[J])) end dst[idst] = res end end - dst end +# Index helpers. Both decode a linear index into a byte-offset by walking compile-time-sized +# segment tuples. The single-segment case (the common one after canonicalization) needs no +# division; multi-segment falls back to a per-element decode (rare: non-adjacent dim sets). + +@inline function _outer_decode(tid, outer_strides, outer_sizes) + isempty(outer_sizes) && return 0 + length(outer_sizes) == 1 && return tid * outer_strides[1] + base = 0 + tmp = tid + @inbounds for i in 1:length(outer_sizes) + q = tmp ÷ outer_sizes[i] + r = tmp - q * outer_sizes[i] + base += r * outer_strides[i] + tmp = q + end + base +end -@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!( - @Const(src), dst, - f, op, - init, dims, +@inline function _reduce_offset(j, reduce_strides, reduce_sizes) + isempty(reduce_sizes) && return 0 + # Single segment: j < reduce_size, so the decode is just j * stride — no division. + length(reduce_sizes) == 1 && return j * reduce_strides[1] + off = 0 + tmp = j + @inbounds for i in 1:length(reduce_sizes) - 1 + sz = reduce_sizes[i] + if ispow2(sz) + shift = trailing_zeros(sz) + r = tmp & (sz - 1) + tmp = tmp >> shift + else + q = tmp ÷ sz + r = tmp - q * sz + tmp = q + end + off += r * reduce_strides[i] + end + off += tmp * reduce_strides[end] + off +end + +# GPU kernel: generic fallback — one thread per output element, reducing sequentially +# over the reduced extents using Cartesian indexing. Makes no assumption about the +# source's memory layout, so it handles strided views, adjoints, permuted dims, and +# broadcasts over them. Mirrors the CPU `_mapreduce_nd_cpu_sections!` path. +# +# NOTE: the source is deliberately NOT marked `@Const` here. `@Const` prevents the +# `@inbounds` getindex of some wrappers (e.g. PermutedDimsArray's `genperm`) from +# being elided, leaving a `throw` that the GPU backends cannot compile. +@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_generic!( + src, dst, + f, op, init, + Rother, Rreduce, output_size, ) - # One thread per output element, when there are more outer elements than in the reduced dim - # e.g. reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim - src_sizes = size(src) - src_strides = strides(src) - dst_sizes = size(dst) - dst_strides = strides(dst) + N = @groupsize()[1] + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 + tid = ithread + iblock * N - output_size = length(dst) - reduce_size = src_sizes[dims] + if tid < output_size + Iother = Rother[tid + 0x1] + res = init + for Ireduce in Rreduce + J = max(Iother, Ireduce) + res = op(res, f(src[J])) + end + dst[Iother] = res + end +end - ndims = length(src_sizes) +@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread_tiled_strided!( + @Const(src), dst, + f, op, init, neutral, + base_offset, reduce_stride, + output_size, reduce_size, + ::Val{rows}, +) where {rows} + @uniform N = @groupsize()[1] + @uniform reduce_threads = N ÷ rows + sdata = @localmem eltype(dst) (N,) + + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 - N = @groupsize()[1] + # Int(...) keeps the index decode signed; a bare `unsigned ÷ unsigned` result feeds + # the signed index math and adds an Int-conversion throw guard NVPTX keeps (CUDA ~20%). + row = Int(unsigned(ithread) % unsigned(rows)) + lane = Int(unsigned(ithread) ÷ unsigned(rows)) + iout = iblock * rows + row + + acc = neutral + if iout < output_size + # Contiguous outputs (outer_strides == (1,)), so the source base is just iout; + # base_offset (0 for a dense source) is folded in once, outside the reduce loop. + sbase = base_offset + iout + j = lane + while j < reduce_size + acc = op(acc, f(src[sbase + j * reduce_stride + 0x1])) + j += reduce_threads + end + end - # NOTE: for many index calculations in this library, computation using zero-indexing leads to - # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero - # indexing). Internal calculations will be done using zero indexing except when actually - # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. + sdata[ithread + 0x1] = acc + @synchronize() + + step = reduce_threads ÷ 2 + while step >= 1 + if lane < step + dst_lane = row + lane * rows + src_lane = row + (lane + step) * rows + sdata[dst_lane + 0x1] = op(sdata[dst_lane + 0x1], sdata[src_lane + 0x1]) + end + @synchronize() + step ÷= 2 + end - # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 + if lane == 0x0 && iout < output_size + dst[iout + 0x1] = op(init, sdata[row + 0x1]) + end +end + +# GPU kernel: by_thread — one thread per output element, reducing sequentially. +# Used when there are more output elements than elements in the reduced dimension(s), +# e.g. reduce(+, rand(3, 1000), dims=1) — only 3 elements to reduce per output. + +@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!( + @Const(src), dst, + f, op, init, + base_offset, + outer_strides, outer_sizes, + reduce_strides, reduce_sizes, + output_size, reduce_size, +) + # NOTE: index calculations use zero-indexing (fewer ops, matches the CUDA / ROCm / oneAPI / + # Metal code this is transpiled to), converting to one-indexing only at memory accesses. + # `base_offset` (the source's element offset within its dense buffer; 0 for a dense + # source) is folded into the per-output base, so it costs at most one add per thread. + N = @groupsize()[1] + iblock = @index(Group, Linear) - 0x1 ithread = @index(Local, Linear) - 0x1 + tid = ithread + iblock * N - # Each thread handles one output element - tid = ithread + iblock * N if tid < output_size + input_base = base_offset + _outer_decode(tid, outer_strides, outer_sizes) - # # Sometimes slightly faster method using additional memory with - # # output_idx = @private typeof(iblock) (ndims,) - # tmp = tid - # KernelAbstractions.Extras.@unroll for i in ndims:-1:1 - # output_idx[i] = tmp ÷ dst_strides[i] - # tmp = tmp % dst_strides[i] - # end - # # Compute the base index in src (excluding the reduced axis) - # input_base_idx = 0 - # KernelAbstractions.Extras.@unroll for i in 1:ndims - # i == dims && continue - # input_base_idx += output_idx[i] * src_strides[i] - # end - - # Compute the base index in src (excluding the reduced axis) - input_base_idx = typeof(ithread)(0) - tmp = tid - KernelAbstractions.Extras.@unroll for i in ndims:-1i16:1i16 - if i != dims - input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i] - end - tmp = tmp % dst_strides[i] - end - - # Go over each element in the reduced dimension; this implementation assumes that there - # are so many outer elements (each processed by an independent thread) that we afford to - # loop sequentially over the reduced dimension (e.g. reduce(+, rand(3, 1000), dims=1)) res = init - for i in 0x0:reduce_size - 0x1 - src_idx = input_base_idx + i * src_strides[dims] - res = op(res, f(src[src_idx + 0x1])) + for j in 0x0:reduce_size - 0x1 + off = _reduce_offset(j, reduce_strides, reduce_sizes) + res = op(res, f(src[input_base + off + 0x1])) end + dst[tid + 0x1] = res end end +# GPU kernel: by_block — each block reduces one output element, then grid-strides to +# the next output (iout += num_blocks) until all outputs are covered. When +# num_blocks >= output_size, each block handles at most one output (the original, +# non-striding behavior) at zero extra cost — the while loop runs once. +# +# Used for GS_DST_CUTOFF <= dst_size < reduce_size (grid-stride, num_blocks == +# min(dst_size, TARGET_BLOCKS) < dst_size in general), and also for the +# reduce_groups==1 fallback inside the multigroup branch (num_blocks == dst_size, +# so the loop runs exactly once per block — identical to the pre-grid-stride kernel). @kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_block!( @Const(src), dst, - f, op, - init, neutral, - dims, + f, op, init, neutral, + base_offset, + outer_strides, outer_sizes, + reduce_strides, reduce_sizes, + output_size, reduce_size, num_blocks, ) - # One block per output element, when there are more elements in the reduced dim than in outer - # e.g. reduce(+, rand(3, 1000), dims=2) => only 3 elements in outer dimensions - src_sizes = size(src) - src_strides = strides(src) - dst_sizes = size(dst) - dst_strides = strides(dst) - - output_size = length(dst) - reduce_size = src_sizes[dims] - - ndims = length(src_sizes) - @uniform N = @groupsize()[1] sdata = @localmem eltype(dst) (N,) - # NOTE: for many index calculations in this library, computation using zero-indexing leads to - # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero - # indexing). Internal calculations will be done using zero indexing except when actually - # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. - - # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 + iblock = @index(Group, Linear) - 0x1 ithread = @index(Local, Linear) - 0x1 + iout = iblock + while true + input_base = base_offset + _outer_decode(iout, outer_strides, outer_sizes) + + acc = neutral + j = ithread + while j < reduce_size + off = _reduce_offset(j, reduce_strides, reduce_sizes) + acc = op(acc, f(src[input_base + off + 0x1])) + j += N + end - # Each block handles one output element - thus, iblock ∈ [0, output_size) - - # # Sometimes slightly faster method using additional memory with - # # output_idx = @private typeof(iblock) (ndims,) - # tmp = iblock - # KernelAbstractions.Extras.@unroll for i in ndims:-1:1 - # output_idx[i] = tmp ÷ dst_strides[i] - # tmp = tmp % dst_strides[i] - # end - # # Compute the base index in src (excluding the reduced axis) - # input_base_idx = 0 - # KernelAbstractions.Extras.@unroll for i in 1:ndims - # i == dims && continue - # input_base_idx += output_idx[i] * src_strides[i] - # end - - # Compute the base index in src (excluding the reduced axis) - input_base_idx = typeof(ithread)(0) - tmp = iblock - KernelAbstractions.Extras.@unroll for i in ndims:-1i16:1i16 - if i != dims - input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i] + sdata[ithread + 0x1] = acc + @synchronize() + + @inline reduce_group!(@context, op, sdata, N, ithread) + + if ithread == 0x0 + dst[iout + 0x1] = op(init, sdata[0x1]) end - tmp = tmp % dst_strides[i] + + next_iout = iout + num_blocks + next_iout >= output_size && break + + @synchronize() + iout = next_iout end +end + +# GPU kernel: multi-group first pass — several blocks per output element. Block `iblock` +# handles output `iout` and group `igroup`, reducing its interleaved slice of the reduction +# into partial[iout, igroup]. The second pass combines the groups. + +@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_multigroup!( + @Const(src), partial, + f, op, neutral, + base_offset, + outer_strides, outer_sizes, + reduce_strides, reduce_sizes, + ::Val{output_size}, reduce_size, reduce_groups, +) where {output_size} + @uniform N = @groupsize()[1] + sdata = @localmem eltype(partial) (N,) + + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 - # We have a block of threads to process the whole reduced dimension. First do pre-reduction - # in strides of N - partial = neutral - i = ithread - while i < reduce_size - src_idx = input_base_idx + i * src_strides[dims] - partial = op(partial, f(src[src_idx + 0x1])) - i += N + # Int(...) keeps the index decode signed (see tiled kernel): avoids a CUDA throw guard. + iout = Int(unsigned(iblock) % unsigned(output_size)) + igroup = Int(unsigned(iblock) ÷ unsigned(output_size)) + + input_base = base_offset + _outer_decode(iout, outer_strides, outer_sizes) + + acc = neutral + j = ithread + igroup * N + while j < reduce_size + off = _reduce_offset(j, reduce_strides, reduce_sizes) + acc = op(acc, f(src[input_base + off + 0x1])) + j += N * reduce_groups end - # Store partial result in shared memory; now we are down to a single block to reduce within - sdata[ithread + 0x1] = partial + sdata[ithread + 0x1] = acc @synchronize() @inline reduce_group!(@context, op, sdata, N, ithread) if ithread == 0x0 - dst[iblock + 0x1] = op(init, sdata[0x1]) + partial[iout + igroup * output_size + 0x1] = sdata[0x1] + end +end + +# GPU kernel: multi-group second pass — one block per output reduces over the `reduce_groups` +# partials and folds in `init`. Launched with a block size sized to `reduce_groups` +# (see _pass2_block_size), avoiding an oversubscribed tree-reduce when reduce_groups +# is much smaller than 256. + +@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_partial_to_dst!( + @Const(partial), dst, + op, init, neutral, + output_size, reduce_groups, +) + @uniform N = @groupsize()[1] + sdata = @localmem eltype(dst) (N,) + + iblock = @index(Group, Linear) - 0x1 + ithread = @index(Local, Linear) - 0x1 + + if iblock < output_size + acc = neutral + g = ithread + while g < reduce_groups + acc = op(acc, partial[iblock + g * output_size + 0x1]) + g += N + end + + sdata[ithread + 0x1] = acc + @synchronize() + + @inline reduce_group!(@context, op, sdata, N, ithread) + + if ithread == 0x0 + dst[iblock + 0x1] = op(init, sdata[0x1]) + end end end diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl index 230ded1b..ba29351e 100644 --- a/src/reduce/reduce.jl +++ b/src/reduce/reduce.jl @@ -1,5 +1,52 @@ # Backend implementations include("utilities.jl") + +const MapReduceSource = Union{AbstractArray, Base.Broadcast.Broadcasted} + +function _mapreduce_backend(src::AbstractArray) + return _mapreduce_get_backend(src) +end + +function _mapreduce_backend(src::Base.Broadcast.Broadcasted) + backend = _mapreduce_backend_from_args(src.args) + return isnothing(backend) ? CPU_BACKEND : backend +end + +function _mapreduce_get_backend(src::AbstractArray) + try + return get_backend(src) + catch err + err isa ArgumentError || rethrow() + return CPU_BACKEND + end +end + +_mapreduce_backend_from_arg(src::AbstractArray) = _mapreduce_get_backend(src) +_mapreduce_backend_from_arg(src::Base.Broadcast.Broadcasted) = _mapreduce_backend(src) +_mapreduce_backend_from_arg(_) = nothing + +function _mapreduce_backend_from_args(args::Tuple) + backend = nothing + for arg in args + arg_backend = _mapreduce_backend_from_arg(arg) + isnothing(arg_backend) && continue + if isnothing(backend) + backend = arg_backend + else + @argcheck arg_backend == backend + end + end + return backend +end + +function _mapreduce_check_map_axes(src::AbstractArray, srcs::AbstractArray...) + src_axes = axes(src) + for other in srcs + axes(other) == src_axes || throw(DimensionMismatch("all input arrays must have the same axes")) + end + return nothing +end + include("mapreduce_1d_cpu.jl") include("mapreduce_1d_gpu.jl") include("mapreduce_nd.jl") @@ -9,8 +56,8 @@ include("mapreduce_nd.jl") reduce( op, src::AbstractArray, backend::Backend=get_backend(src); init, - neutral=neutral_element(op, eltype(src)), - dims::Union{Nothing, Int}=nothing, + neutral=neutral_element(op, typeof(init)), + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks::Int=Threads.nthreads(), @@ -22,16 +69,17 @@ include("mapreduce_nd.jl") switch_below::Int=0, ) -Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing`, reduce -`src` to a scalar. If `dims` is an integer, reduce `src` along that dimension. The `init` value is -used as the initial value for the reduction; `neutral` is the neutral element for the operator `op`. +Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing` or +`:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of integers, reduce `src` along +those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` is +the neutral element for the operator `op`. The returned type is the same as `init` - to control output precision, specify `init` explicitly. ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional -arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other -cases are scaling linearly with the number of threads. +arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for +`max_tasks >= 4`; all other cases are scaling linearly with the number of threads. Note that multithreading reductions only improves performance for cases with more compute-heavy operations, which hide the memory latency and thread launch overhead - that includes: @@ -41,14 +89,15 @@ operations, which hide the memory latency and thread launch overhead - that incl For non-memory-bound operations, reductions scale almost linearly with the number of threads. ## GPU settings -The `block_size` parameter controls the number of threads per block. +The `block_size` parameter controls the number of threads per block and must be a power of two. The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar -(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is -required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination -array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, -except for the reduced dimension which becomes 1; there are some corner cases when one dimension is -zero, check against `Base.reduce` for CPU arrays for exact behavior. +(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * +block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is +used as the destination array, and thus must have the exact dimensions required - i.e. same +dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some +corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact +behavior. The `switch_below` parameter controls the threshold below which the reduction is performed on the CPU and is only used for 1D reductions (i.e. `dims=nothing`). @@ -74,7 +123,7 @@ mcolsum = AK.reduce(+, m; init=zero(eltype(m)), dims=2) ``` """ function reduce( - op, src::AbstractArray, backend::Backend=get_backend(src); + op, src::AbstractArray, backend::Backend=_mapreduce_backend(src); init, kwargs... ) @@ -92,8 +141,8 @@ end mapreduce( f, op, src::AbstractArray, backend::Backend=get_backend(src); init, - neutral=neutral_element(op, eltype(src)), - dims::Union{Nothing, Int}=nothing, + neutral=neutral_element(op, typeof(init)), + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing, # CPU settings max_tasks::Int=Threads.nthreads(), @@ -105,29 +154,40 @@ end switch_below::Int=0, ) + mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray...; init, kwargs...) + mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray..., backend::Backend; init, kwargs...) + Reduce `src` along dimensions `dims` using the binary operator `op` after applying `f` elementwise. -If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that -dimension. The `init` value is used as the initial value for the reduction (i.e. after mapping). +If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of +integers, reduce `src` along those dimension(s). The `init` value is used as the initial value for +the reduction (i.e. after mapping). -The `neutral` value is the neutral element (zero) for the operator `op`, which is needed for an -efficient GPU implementation that also allows a nonzero `init`. +The `neutral` value is the neutral element for the operator `op`, which is needed for an efficient +GPU implementation that also allows a nonzero `init`. The returned type is the same as `init` - to control output precision, specify `init` explicitly. +Multiple input arrays are supported with the same axes. This follows `Base.mapreduce(f, op, A, B, +...)` semantics: `f` is mapped across corresponding elements of the inputs and the mapped values +are reduced without materializing the intermediate array. Mismatched axes throw +`DimensionMismatch`; singleton-expanding broadcast semantics are reserved for internal +`Broadcasted` sources used by array backends. + ## CPU settings Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional -arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other -cases are scaling linearly with the number of threads. +arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for +`max_tasks >= 4`; all other cases are scaling linearly with the number of threads. ## GPU settings -The `block_size` parameter controls the number of threads per block. +The `block_size` parameter controls the number of threads per block and must be a power of two. The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar -(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is -required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination -array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`, -except for the reduced dimension which becomes 1; there are some corner cases when one dimension is -zero, check against `Base.reduce` for CPU arrays for exact behavior. +(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * +block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is +used as the destination array, and thus must have the exact dimensions required - i.e. same +dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some +corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact +behavior. The `switch_below` parameter controls the threshold below which the reduction is performed on the CPU and is only used for 1D reductions (i.e. `dims=nothing`). @@ -152,9 +212,19 @@ m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000)) mrowsumsq = AK.mapreduce(f, +, m; init=zero(eltype(m)), dims=1) mcolsumsq = AK.mapreduce(f, +, m; init=zero(eltype(m)), dims=2) ``` + +Computing a two-input dimensional reduction: +```julia +rows = AK.mapreduce((x, y) -> x * y, +, a, b; init=0f0, dims=1) +``` + +An explicit backend may be passed after all input arrays: +```julia +rows = AK.mapreduce((x, y) -> x * y, +, a, b, backend; init=0f0, dims=1) +``` """ function mapreduce( - f, op, src::AbstractArray, backend::Backend=get_backend(src); + f, op, src::MapReduceSource, backend::Backend=_mapreduce_backend(src); init, kwargs... ) @@ -165,12 +235,53 @@ function mapreduce( ) end +function mapreduce( + f, op, src::AbstractArray, src2::AbstractArray, srcs::AbstractArray...; + init, + kwargs... +) + return _mapreduce_multi( + f, op, nothing, src, src2, srcs...; + init, + kwargs... + ) +end + +function mapreduce( + f, op, src::AbstractArray, src2::AbstractArray, arg, args...; + init, + kwargs... +) + backend = isempty(args) ? arg : args[end] + backend isa Backend || throw(MethodError(mapreduce, (f, op, src, src2, arg, args...))) + srcs = isempty(args) ? () : (arg, args[1:end - 1]...) + return _mapreduce_multi( + f, op, backend, src, src2, srcs...; + init, + kwargs... + ) +end + +function _mapreduce_multi( + f, op, backend::Union{Nothing, Backend}, src::AbstractArray, srcs::AbstractArray...; + init, + kwargs... +) + _mapreduce_check_map_axes(src, srcs...) + bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(f, src, srcs...)) + return mapreduce( + identity, op, bc, isnothing(backend) ? _mapreduce_backend(bc) : backend; + init, + kwargs... + ) +end + function _mapreduce_impl( - f, op, src::AbstractArray, backend::Backend; + f, op, src::MapReduceSource, backend::Backend; init, - neutral=neutral_element(op, eltype(src)), - dims::Union{Nothing, Int}=nothing, + neutral=neutral_element(op, typeof(init)), + dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon} = nothing, # CPU settings max_tasks::Int=Threads.nthreads(), @@ -182,7 +293,14 @@ function _mapreduce_impl( temp::Union{Nothing, AbstractArray}=nothing, switch_below::Int=0, ) - if isnothing(dims) + + # scalar *linear* indexing into a multidimensional Broadcasted object is + # only available on Julia 1.12; on earlier version, materialize it first. + if VERSION < v"1.12-" && src isa Base.Broadcast.Broadcasted + src = Base.Broadcast.materialize(src) + end + + if isnothing(dims) || dims isa Colon if use_gpu_algorithm(backend, prefer_threads) mapreduce_1d_gpu( f, op, src, backend; diff --git a/test/generic/reduce.jl b/test/generic/reduce.jl index 9fe8b5c2..3f0587ea 100644 --- a/test/generic/reduce.jl +++ b/test/generic/reduce.jl @@ -120,8 +120,20 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) @test s == reduce(+, vh) end + # Base-compatible alias: dims=: reduces all dimensions to a scalar. + vh_colon = rand(Int32(1):Int32(10), 3, 4, 5) + @test AK.reduce(+, array_from_host(vh_colon); prefer_threads, init=Int32(0), dims=:) == + reduce(+, vh_colon; init=Int32(0), dims=:) + + vh_one = Int32[7] + @test AK.reduce(+, array_from_host(vh_one); prefer_threads, init=Int32(10)) == + reduce(+, vh_one; init=Int32(10)) + # Test that undefined kwargs are not accepted @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10)); init=10, bad=:kwarg) + if !IS_CPU_BACKEND + @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 256)); prefer_threads, init=Int32(0), block_size=192) + end # Testing different settings AK.reduce( @@ -222,8 +234,67 @@ end end end + # Duplicate dims match Base semantics and are reduced once. + vh_dup = rand(Int32(1):Int32(10), 3, 4, 5) + @test Array(AK.reduce(+, array_from_host(vh_dup); prefer_threads, init=Int32(0), dims=(2,2))) == + sum(vh_dup; init=Int32(0), dims=(2,2)) + + # min/max with dims: tests correct neutral element in partial reduction + for dims in 1:3 + n1 = rand(1:50); n2 = rand(1:50); n3 = rand(1:50) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + @test Array(AK.reduce(min, v; prefer_threads, init=typemax(Int32), neutral=typemax(Int32), dims)) == minimum(vh; dims) + @test Array(AK.reduce(max, v; prefer_threads, init=typemin(Int32), neutral=typemin(Int32), dims)) == maximum(vh; dims) + end + + # Tuple dims support. Order and duplicates match Base semantics. + for dims in [(1,2), (1,3), (2,3), (1,2,3), (2,1), (3,1), (2,1,2)] + for n1 in [1, 5, 10], n2 in [1, 5, 10], n3 in [1, 5, 10] + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.reduce(+, v; prefer_threads, init=Int32(0), dims) + sh = Array(s) + @test sh == sum(vh; dims) + end + end + + # Tiled strided GPU path: contiguous kept dimensions, one strided reduce + # dimension, and dst_size == reduce_size. The 3D case also exercises a + # partial output tile. + for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3)) + vh = rand(Int32(1):Int32(3), shape...) + v = array_from_host(vh) + @test Array(AK.reduce(+, v; prefer_threads, init=Int32(0), dims)) == + sum(vh; init=Int32(0), dims) + end + + if IS_CPU_BACKEND + # The CPU fallback should not require strided storage. + vh = reshape(1:12, 1, 3, 4) + @test Array(AK.reduce(+, vh, BACKEND; prefer_threads, init=0, dims=(1,2))) == + sum(vh; init=0, dims=(1,2)) + else + # Strided GPU sources (views, adjoints, permuted dims) take the stride-based + # fast path over their dense parent buffer; the offset view exercises a nonzero + # base offset. Broadcasted/lazy sources still take the generic fallback. + vh = reshape(Int32(1):Int32(40), 5, 8) + v = array_from_host(vh) + @test Array(AK.reduce(+, @view(v[:, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) == + Base.reduce(+, @view(vh[:, 1:2:end]); init=Int32(0), dims=2) + @test Array(AK.reduce(+, @view(v[2:end, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) == + Base.reduce(+, @view(vh[2:end, 1:2:end]); init=Int32(0), dims=2) + @test Array(AK.reduce(+, v'; prefer_threads, init=Int32(0), dims=1)) == + Base.reduce(+, vh'; init=Int32(0), dims=1) + @test Array(AK.reduce(+, PermutedDimsArray(v, (2, 1)); prefer_threads, init=Int32(0), dims=1)) == + Base.reduce(+, PermutedDimsArray(vh, (2, 1)); init=Int32(0), dims=1) + end + # Test that undefined kwargs are not accepted @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); prefer_threads, init=10, bad=:kwarg) + if !IS_CPU_BACKEND + @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 16, 16)); prefer_threads, init=Int32(0), dims=1, block_size=192) + end # Testing different settings AK.reduce( @@ -342,6 +413,76 @@ end @test s == mapreduce(abs, +, vh) end + # Base-compatible alias: dims=: reduces all dimensions to a scalar. + vh_colon = rand(Int32(-10):Int32(10), 3, 4, 5) + @test AK.mapreduce(abs, +, array_from_host(vh_colon); prefer_threads, init=Int32(0), dims=:) == + mapreduce(abs, +, vh_colon; init=Int32(0), dims=:) + + vh_one = Int32[-7] + @test AK.mapreduce(abs, +, array_from_host(vh_one); prefer_threads, init=Int32(10)) == + mapreduce(abs, +, vh_one; init=Int32(10)) + + vh_typechange = rand(Int32(-10):Int32(10), 4, 5) + f_typechange = x -> Float32(x) / 2 + @test AK.mapreduce(f_typechange, +, array_from_host(vh_typechange); prefer_threads, init=0f0) ≈ + mapreduce(f_typechange, +, vh_typechange; init=0f0) + @test Array(AK.mapreduce(f_typechange, +, array_from_host(vh_typechange); prefer_threads, init=0f0, dims=2)) ≈ + mapreduce(f_typechange, +, vh_typechange; init=0f0, dims=2) + f_min_typechange = x -> Float32(10_000_000_000 + x) + f_max_typechange = x -> Float32(-10_000_000_000 + x) + @test AK.mapreduce(f_min_typechange, min, array_from_host(vh_typechange); prefer_threads, init=Inf32) ≈ + mapreduce(f_min_typechange, min, vh_typechange; init=Inf32) + @test AK.mapreduce(f_max_typechange, max, array_from_host(vh_typechange); prefer_threads, init=-Inf32) ≈ + mapreduce(f_max_typechange, max, vh_typechange; init=-Inf32) + + # Multi-input mapreduce lowers through a broadcasted source. + vh_a = rand(Int32(-10):Int32(10), 4, 5, 6) + vh_b = rand(Int32(-10):Int32(10), 4, 5, 6) + vh_c = rand(Int32(-10):Int32(10), 4, 5, 6) + v_a = array_from_host(vh_a) + v_b = array_from_host(vh_b) + v_c = array_from_host(vh_c) + @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0)) == + mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0)) + @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b, BACKEND; prefer_threads, init=Int32(0)) == + mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0)) + @test AK.mapreduce((x, y, z) -> x + y * z, +, v_a, v_b, v_c, BACKEND; prefer_threads, init=Int32(0)) == + mapreduce((x, y, z) -> x + y * z, +, vh_a, vh_b, vh_c; init=Int32(0)) + @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0), dims=:) == + mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0), dims=:) + @test Array(AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0), dims=())) == + mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0), dims=()) + @test AK.mapreduce((x, y) -> Float32(x - y) / 3, +, v_a, v_b; prefer_threads, init=0f0) ≈ + mapreduce((x, y) -> Float32(x - y) / 3, +, vh_a, vh_b; init=0f0) + + for (shape, dims) in (((0, 3), 1), ((2, 0), 2), ((0, 0), (1, 2)), ((0, 3), ())) + h_empty1 = reshape(Int32[], shape...) + h_empty2 = fill(Int32(2), shape...) + @test Array(AK.mapreduce((x, y) -> x + y, +, + array_from_host(h_empty1), + array_from_host(h_empty2); + prefer_threads, init=Int32(10), dims)) == + mapreduce((x, y) -> x + y, +, h_empty1, h_empty2; init=Int32(10), dims) + end + + @test_throws DimensionMismatch AK.mapreduce( + (x, y) -> x + y, +, + array_from_host(rand(Int32, 2, 3)), + array_from_host(rand(Int32, 1, 3)); + prefer_threads, + init=Int32(0), + ) + + if IS_CPU_BACKEND + bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(+, reshape(1:6, 2, 3), reshape(10:15, 2, 3))) + @test AK.mapreduce(identity, +, bc; prefer_threads, init=0) == + mapreduce(identity, +, bc; init=0) + @test Array(AK.mapreduce(identity, +, bc; prefer_threads, init=0, dims=2)) == + mapreduce(identity, +, bc; init=0, dims=2) + @test Array(AK.mapreduce(identity, +, bc; prefer_threads, init=0, dims=())) == + mapreduce(identity, +, bc; init=0, dims=()) + end + # Testing different settings, enforcing change of type between f and op f(s, temp) = AK.mapreduce( p -> (p.x, p.y), @@ -362,6 +503,9 @@ end # Test that undefined kwargs are not accepted @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg) + if !IS_CPU_BACKEND + @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 256)); prefer_threads, init=Int32(0), block_size=192) + end end @@ -458,8 +602,87 @@ end end end + # Duplicate dims match Base semantics and are reduced once. + vh_dup = rand(Int32(1):Int32(10), 3, 4, 5) + @test Array(AK.mapreduce(-, +, array_from_host(vh_dup); prefer_threads, init=Int32(0), dims=(2,2))) == + mapreduce(-, +, vh_dup; init=Int32(0), dims=(2,2)) + + # Multi-input mapreduce with dimensional reductions. + vh_ma = rand(Int32(-10):Int32(10), 4, 5, 6) + vh_mb = rand(Int32(-10):Int32(10), 4, 5, 6) + v_ma = array_from_host(vh_ma) + v_mb = array_from_host(vh_mb) + for dims in (1, 2, (1, 2), (1, 3), (1, 2, 3), (2, 2)) + @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb; prefer_threads, init=Int32(0), dims)) == + mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims) + end + @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb, BACKEND; prefer_threads, init=Int32(0), dims=(1, 2))) == + mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims=(1, 2)) + @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb; prefer_threads, init=Int32(0), dims=())) == + mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims=()) + @test Array(AK.mapreduce((x, y) -> Float32(x - y) / 3, +, v_ma, v_mb; prefer_threads, init=0f0, dims=(1, 2))) ≈ + mapreduce((x, y) -> Float32(x - y) / 3, +, vh_ma, vh_mb; init=0f0, dims=(1, 2)) + vh_typechange_nd = rand(Int32(-10):Int32(10), 4, 5) + f_min_typechange_nd = x -> Float32(10_000_000_000 + x) + f_max_typechange_nd = x -> Float32(-10_000_000_000 + x) + @test Array(AK.mapreduce(f_min_typechange_nd, min, array_from_host(vh_typechange_nd); prefer_threads, init=Inf32, dims=2)) ≈ + mapreduce(f_min_typechange_nd, min, vh_typechange_nd; init=Inf32, dims=2) + @test Array(AK.mapreduce(f_max_typechange_nd, max, array_from_host(vh_typechange_nd); prefer_threads, init=-Inf32, dims=2)) ≈ + mapreduce(f_max_typechange_nd, max, vh_typechange_nd; init=-Inf32, dims=2) + + # min/max with dims: tests correct neutral element in partial reduction + for dims in 1:3 + n1 = rand(1:50); n2 = rand(1:50); n3 = rand(1:50) + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + @test Array(AK.reduce(min, v; prefer_threads, init=typemax(Int32), neutral=typemax(Int32), dims)) == minimum(vh; dims) + @test Array(AK.reduce(max, v; prefer_threads, init=typemin(Int32), neutral=typemin(Int32), dims)) == maximum(vh; dims) + end + + # Tuple dims support. Order and duplicates match Base semantics. + for dims in [(1,2), (1,3), (2,3), (1,2,3), (2,1), (3,1), (2,1,2)] + for n1 in [1, 5, 10], n2 in [1, 5, 10], n3 in [1, 5, 10] + vh = rand(Int32(1):Int32(100), n1, n2, n3) + v = array_from_host(vh) + s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims) + sh = Array(s) + @test sh == mapreduce(-, +, vh; init=Int32(0), dims) + end + end + + # Tiled strided GPU path coverage for mapreduce, including a 3D case with + # a partial output tile. + for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3)) + vh = rand(Int32(1):Int32(3), shape...) + v = array_from_host(vh) + @test Array(AK.mapreduce(x -> x - Int32(1), +, v; prefer_threads, init=Int32(0), dims)) == + mapreduce(x -> x - Int32(1), +, vh; init=Int32(0), dims) + end + + if IS_CPU_BACKEND + # The CPU fallback should not require strided storage. + vh = reshape(1:12, 1, 3, 4) + @test Array(AK.mapreduce(x -> 2x, +, vh, BACKEND; prefer_threads, init=0, dims=(1,2))) == + mapreduce(x -> 2x, +, vh; init=0, dims=(1,2)) + else + # Strided GPU sources (views, adjoints, permuted dims) take the stride-based + # fast path over their dense parent buffer; the offset view exercises a nonzero + # base offset. Broadcasted/lazy sources still take the generic fallback. + vh = reshape(Int32(1):Int32(40), 5, 8) + v = array_from_host(vh) + @test Array(AK.mapreduce(x -> x - Int32(1), +, @view(v[:, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) == + mapreduce(x -> x - Int32(1), +, @view(vh[:, 1:2:end]); init=Int32(0), dims=2) + @test Array(AK.mapreduce(x -> x - Int32(1), +, @view(v[2:end, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) == + mapreduce(x -> x - Int32(1), +, @view(vh[2:end, 1:2:end]); init=Int32(0), dims=2) + @test Array(AK.mapreduce(x -> x - Int32(1), +, PermutedDimsArray(v, (2, 1)); prefer_threads, init=Int32(0), dims=1)) == + mapreduce(x -> x - Int32(1), +, PermutedDimsArray(vh, (2, 1)); init=Int32(0), dims=1) + end + # Test that undefined kwargs are not accepted @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, init=10, bad=:kwarg) + if !IS_CPU_BACKEND + @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 16, 16)); prefer_threads, init=Int32(0), dims=1, block_size=192) + end # Testing different settings AK.mapreduce( @@ -528,6 +751,7 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) AK.sum(v; prefer_threads, block_size=64) + @test AK.sum(v; prefer_threads, dims=:) == sum(Array(v); dims=:) # Test that undefined kwargs are not accepted @test_throws MethodError AK.sum(v; prefer_threads, bad=:kwarg) @@ -573,6 +797,7 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) AK.prod(v; prefer_threads, block_size=64) + @test AK.prod(v; prefer_threads, dims=:) == prod(Array(v); dims=:) # Test that undefined kwargs are not accepted @test_throws MethodError AK.prod(v; prefer_threads, bad=:kwarg) @@ -618,6 +843,7 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) AK.minimum(v; prefer_threads, block_size=64) + @test AK.minimum(v; prefer_threads, dims=:) == minimum(Array(v); dims=:) # Test that undefined kwargs are not accepted @test_throws MethodError AK.minimum(v; prefer_threads, bad=:kwarg) @@ -663,6 +889,7 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) AK.maximum(v; prefer_threads, block_size=64) + @test AK.maximum(v; prefer_threads, dims=:) == maximum(Array(v); dims=:) # Test that undefined kwargs are not accepted @test_throws MethodError AK.maximum(v; prefer_threads, bad=:kwarg) @@ -715,6 +942,7 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) AK.count(x->x>0, v; prefer_threads, block_size=64) + @test AK.count(x->x>0, v; prefer_threads, dims=:) == count(x->x>0, Array(v); dims=:) # Test that undefined kwargs are not accepted @test_throws MethodError AK.count(v; prefer_threads, bad=:kwarg)