JuliaGPU · maleadt · Jun 19, 2026 · Jun 1, 2026 · Jun 6, 2026 · Jun 9, 2026
diff --git a/src/arithmetics.jl b/src/arithmetics.jl
@@ -2,7 +2,7 @@
     sum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=zero(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -33,11 +33,11 @@ m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000))
 s = AK.sum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise sum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimension-wise sum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = MtlArray(zeros(Int32, 10))
+temp = MtlArray(zeros(Int32, 10, 1))
 s = AK.sum(m, dims=2, temp=temp)
 ```
 """
@@ -58,7 +58,7 @@ end
     prod(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=one(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -89,11 +89,11 @@ m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000))
 p = AK.prod(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise product, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimension-wise product, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = ROCArray(ones(Int32, 10))
+temp = ROCArray(ones(Int32, 10, 1))
 p = AK.prod(m, dims=2, temp=temp)
 ```
 """
@@ -114,7 +114,7 @@ end
     maximum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemin(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -145,11 +145,11 @@ m = oneArray(rand(Int32(1):Int32(100), 10, 100_000))
 m = AK.maximum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise maximum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimension-wise maximum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = oneArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = oneArray(zeros(Int32, 10))
+temp = oneArray(zeros(Int32, 10, 1))
 m = AK.maximum(m, dims=2, temp=temp)
 ```
 """
@@ -170,7 +170,7 @@ end
     minimum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemax(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -201,11 +201,11 @@ m = CuArray(rand(Int32(1):Int32(100), 10, 100_000))
 m = AK.minimum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise minimum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimension-wise minimum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = CuArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = CuArray(ones(Int32, 10))
+temp = CuArray(ones(Int32, 10, 1))
 m = AK.minimum(m, dims=2, temp=temp)
 ```
 """
@@ -226,7 +226,7 @@ end
     count(
         [f=identity], src::AbstractArray, backend::Backend=get_backend(src);
         init=0,
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -263,12 +263,12 @@ m = MtlArray(rand(Bool, 10, 100_000))
 c = AK.count(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise count, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimension-wise count, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = MtlArray(rand(Bool, 10, 100_000))
-temp = MtlArray(zeros(Int32, 10))
-c = AK.count(m, dims=2, temp=temp)
+temp = MtlArray(zeros(Int32, 10, 1))
+c = AK.count(m; init=Int32(0), dims=2, temp=temp)
 ```
 """
 function count(

diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl
@@ -1,5 +1,5 @@
 function mapreduce_1d_cpu(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral,
 
@@ -12,6 +12,10 @@ function mapreduce_1d_cpu(
     temp::Union{Nothing, AbstractArray},
     switch_below::Int,
 )
+    if src isa Base.Broadcast.Broadcasted
+        return op(init, Base.mapreduce(f, op, src; init=neutral))
+    end
+
     if max_tasks == 1
         return op(init, Base.mapreduce(f, op, src; init=neutral))
     end

diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
@@ -47,7 +47,7 @@ end
 
 
 function mapreduce_1d_gpu(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral,
 
@@ -61,12 +61,13 @@ function mapreduce_1d_gpu(
     switch_below::Int,
 )
     @argcheck 1 <= block_size <= 1024
+    @argcheck ispow2(block_size)
     @argcheck switch_below >= 0
 
     # Degenerate cases
     len = length(src)
     len == 0 && return init
-    len == 1 && return @allowscalar f(src[1])
+    len == 1 && return op(init, @allowscalar f(src[1]))
     if len < switch_below
         h_src = Vector(src)
         return Base.mapreduce(f, op, h_src; init)
@@ -87,8 +88,8 @@ function mapreduce_1d_gpu(
         dst = KernelAbstractions.allocate(backend, dst_type, blocks * 2)
     end
 
-    # Later the kernel will be compiled for views anyways, so use same types
-    src_view = @view src[1:end]
+    # The kernel will later be compiled for views anyway, so use the same types for arrays.
+    src_view = _mapreduce_1d_src_view(src)
     dst_view = @view dst[1:blocks]
 
     kernel! = _mapreduce_block!(backend, block_size)
@@ -125,3 +126,6 @@ function mapreduce_1d_gpu(
     # The GPU kernel reduced all elements to one, but without the init value
     return op(init, @allowscalar(p1[1]))
 end
+
+_mapreduce_1d_src_view(src::AbstractArray) = @view src[1:end]  # arrays: wrap in a `1:end` view
+_mapreduce_1d_src_view(src::Base.Broadcast.Broadcasted) = src  # broadcasts: returned unchanged