JuliaGPU · maleadt · Jun 19, 2026 · Jun 19, 2026
diff --git a/src/arithmetics.jl b/src/arithmetics.jl
@@ -2,7 +2,7 @@
     sum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=zero(eltype(src)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -58,7 +58,7 @@ end
     prod(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=one(eltype(src)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -114,7 +114,7 @@ end
     maximum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemin(eltype(src)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -170,7 +170,7 @@ end
     minimum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemax(eltype(src)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -226,7 +226,7 @@ end
     count(
         [f=identity], src::AbstractArray, backend::Backend=get_backend(src);
         init=0,
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),

diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
@@ -1,5 +1,5 @@
 # Generalized N-dimensional mapreduce for GPU and CPU backends, reducing one or more
-# dimensions (`dims::Int` or `dims::Tuple`) of `src` into `dst`.
+# dimensions (`dims::Integer` or a collection of integers) of `src` into `dst`.
 #
 # Design (see references: CUDA.jl / GPUArrays mapreducedim!, PyTorch Reduce.cuh, CUB):
 #   1. Canonicalize dims: collapse adjacent dimensions with matching strides into contiguous
@@ -77,7 +77,7 @@ function mapreduce_nd(
     f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral=neutral_element(op, typeof(init)),
-    dims::Union{Int, Tuple{Vararg{Int}}},
+    dims,
 
     # CPU settings
     max_tasks::Int,
@@ -91,14 +91,17 @@ function mapreduce_nd(
     @argcheck 1 <= block_size <= 1024
     @argcheck ispow2(block_size)
 
-    dims_all = dims isa Int ? (dims,) : dims
-
-    if Base.any(d < 1 for d in dims_all)
-        throw(ArgumentError("region dimension(s) must be ≥ 1, got $dims"))
+    dims_src = dims isa Number ? (dims,) : dims
+    dims_buf = Int[]
+    for d in dims_src
+        d isa Integer || throw(ArgumentError("reduced dimension(s) must be integers"))
+        dim = Int(d)
+        dim < 1 && throw(ArgumentError("region dimension(s) must be ≥ 1, got $d"))
+        push!(dims_buf, dim)
     end
 
     # Match Base: duplicate dims are ignored, e.g. dims=(2,2) behaves like dims=2.
-    dims_all = Tuple(Base.unique(dims_all))
+    dims_all = Tuple(Base.unique(dims_buf))
 
     src_sizes = size(src)
     ndim      = length(src_sizes)

diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl
@@ -57,7 +57,7 @@ include("mapreduce_nd.jl")
         op, src::AbstractArray, backend::Backend=get_backend(src);
         init,
         neutral=neutral_element(op, typeof(init)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
@@ -70,16 +70,16 @@ include("mapreduce_nd.jl")
     )
 
 Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing` or
-`:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of integers, reduce `src` along
-those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` is
-the neutral element for the operator `op`.
+`:`, reduce `src` to a scalar. If `dims` is an integer or a collection of integers, reduce `src`
+along those dimension(s). The `init` value is used as the initial value for the reduction; `neutral`
+is the neutral element for the operator `op`.
 
 The returned type is the same as `init` - to control output precision, specify `init` explicitly.
 
 ## CPU settings
 Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
-arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
-`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
+arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes
+faster for `max_tasks >= 4`; all other cases scale linearly with the number of threads.
 
 Note that multithreading reductions only improves performance for cases with more compute-heavy
 operations, which hide the memory latency and thread launch overhead - that includes:
@@ -93,8 +93,8 @@ The `block_size` parameter controls the number of threads per block and must be
 
 The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
 (`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
-block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
-used as the destination array, and thus must have the exact dimensions required - i.e. same
+block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of
+integers), `temp` is the destination array and must have exactly the required dimensions - i.e. same
 dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some
 corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact
 behavior.
@@ -142,7 +142,7 @@ end
         f, op, src::AbstractArray, backend::Backend=get_backend(src);
         init,
         neutral=neutral_element(op, typeof(init)),
-        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
+        dims=nothing,
 
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
@@ -158,7 +158,7 @@ end
     mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray..., backend::Backend; init, kwargs...)
 
 Reduce `src` along dimensions `dims` using the binary operator `op` after applying `f` elementwise.
-If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of
+If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a collection of
 integers, reduce `src` along those dimension(s). The `init` value is used as the initial value for
 the reduction (i.e. after mapping).
 
@@ -175,16 +175,16 @@ are reduced without materializing the intermediate array. Mismatched axes throw
 
 ## CPU settings
 Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
-arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
-`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
+arrays (`dims` is an integer or a collection of integers) multithreading currently only becomes
+faster for `max_tasks >= 4`; all other cases scale linearly with the number of threads.
 
 ## GPU settings
 The `block_size` parameter controls the number of threads per block and must be a power of two.
 
 The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
 (`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
-block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
-used as the destination array, and thus must have the exact dimensions required - i.e. same
+block_size)` is required. For reduction along dimensions (`dims` is an integer or a collection of
+integers), `temp` is the destination array and must have exactly the required dimensions - i.e. same
 dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some
 corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact
 behavior.
@@ -281,7 +281,7 @@ function _mapreduce_impl(
     f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral=neutral_element(op, typeof(init)),
-    dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon} = nothing,
+    dims = nothing,
 
     # CPU settings
     max_tasks::Int=Threads.nthreads(),

diff --git a/test/generic/reduce.jl b/test/generic/reduce.jl
@@ -259,6 +259,16 @@ end
         end
     end
 
+    # Base also accepts iterable dims such as vectors and ranges.
+    for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2)
+        vh = rand(Int32(1):Int32(100), 3, 4, 5)
+        v = array_from_host(vh)
+        @test Array(AK.reduce(+, v; prefer_threads, init=Int32(0), dims)) ==
+            sum(vh; init=Int32(0), dims)
+    end
+
+    @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0])
+
     # Tiled strided GPU path: contiguous kept dimensions, one strided reduce
     # dimension, and dst_size == reduce_size. The 3D case also exercises a
     # partial output tile.
@@ -650,6 +660,16 @@ end
         end
     end
 
+    # Base also accepts iterable dims such as vectors and ranges.
+    for dims in ([1,2], [1,3], [2,3], [1,2,3], [2,1], [2,1,2], Int[], Any[1,2], Int32[1,2], 1:2)
+        vh = rand(Int32(1):Int32(100), 3, 4, 5)
+        v = array_from_host(vh)
+        @test Array(AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims)) ==
+            mapreduce(-, +, vh; init=Int32(0), dims)
+    end
+
+    @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4)); prefer_threads, init=Int32(0), dims=[1.0, 2.0])
+
     # Tiled strided GPU path coverage for mapreduce, including a 3D case with
     # a partial output tile.
     for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3))