diff --git a/src/arithmetics.jl b/src/arithmetics.jl
index a14a9e27..73c618a0 100644
--- a/src/arithmetics.jl
+++ b/src/arithmetics.jl
@@ -2,7 +2,7 @@
     sum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=zero(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -33,11 +33,11 @@ m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000))
 s = AK.sum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise sum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimensionwise sum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = MtlArray(zeros(Int32, 10))
+temp = MtlArray(zeros(Int32, 10, 1))
 s = AK.sum(m, dims=2, temp=temp)
 ```
 """
@@ -58,7 +58,7 @@ end
     prod(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=one(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -89,11 +89,11 @@ m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000))
 p = AK.prod(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise product, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimensionwise product, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = ROCArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = ROCArray(ones(Int32, 10))
+temp = ROCArray(ones(Int32, 10, 1))
 p = AK.prod(m, dims=2, temp=temp)
 ```
 """
@@ -114,7 +114,7 @@ end
     maximum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemin(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -145,11 +145,11 @@ m = oneArray(rand(Int32(1):Int32(100), 10, 100_000))
 m = AK.maximum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise maximum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimensionwise maximum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = oneArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = oneArray(zeros(Int32, 10))
+temp = oneArray(zeros(Int32, 10, 1))
 m = AK.maximum(m, dims=2, temp=temp)
 ```
 """
@@ -170,7 +170,7 @@ end
     minimum(
         src::AbstractArray, backend::Backend=get_backend(src);
         init=typemax(eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -201,11 +201,11 @@ m = CuArray(rand(Int32(1):Int32(100), 10, 100_000))
 m = AK.minimum(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise minimum, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimensionwise minimum, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = CuArray(rand(Int32(1):Int32(100), 10, 100_000))
-temp = CuArray(ones(Int32, 10))
+temp = CuArray(ones(Int32, 10, 1))
 m = AK.minimum(m, dims=2, temp=temp)
 ```
 """
@@ -226,7 +226,7 @@ end
     count(
         [f=identity], src::AbstractArray, backend::Backend=get_backend(src);
         init=0,
-        dims::Union{Nothing, Int}=nothing,
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks=Threads.nthreads(),
@@ -263,12 +263,12 @@ m = MtlArray(rand(Bool, 10, 100_000))
 c = AK.count(m, dims=1)
 ```
 
-If you know the shape of the resulting array (in case of a axis-wise count, i.e. `dims` is not
+If you know the shape of the resulting array (in case of a dimensionwise count, i.e. `dims` is not
 `nothing`), you can provide the `temp` argument to save results into and avoid allocations:
 ```julia
 m = MtlArray(rand(Bool, 10, 100_000))
-temp = MtlArray(zeros(Int32, 10))
-c = AK.count(m, dims=2, temp=temp)
+temp = MtlArray(zeros(Int32, 10, 1))
+c = AK.count(m; init=Int32(0), dims=2, temp=temp)
 ```
 """
 function count(
diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl
index 95a93f21..3a47b603 100644
--- a/src/reduce/mapreduce_1d_cpu.jl
+++ b/src/reduce/mapreduce_1d_cpu.jl
@@ -1,5 +1,5 @@
 function mapreduce_1d_cpu(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral,
 
@@ -12,6 +12,10 @@ function mapreduce_1d_cpu(
     temp::Union{Nothing, AbstractArray},
     switch_below::Int,
 )
+    if src isa Base.Broadcast.Broadcasted
+        return op(init, Base.mapreduce(f, op, src; init=neutral))
+    end
+
     if max_tasks == 1
         return op(init, Base.mapreduce(f, op, src; init=neutral))
     end
diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
index 39e7c415..f016fa41 100644
--- a/src/reduce/mapreduce_1d_gpu.jl
+++ b/src/reduce/mapreduce_1d_gpu.jl
@@ -47,7 +47,7 @@ end
 
 
 function mapreduce_1d_gpu(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
     neutral,
 
@@ -61,12 +61,13 @@ function mapreduce_1d_gpu(
     switch_below::Int,
 )
     @argcheck 1 <= block_size <= 1024
+    @argcheck ispow2(block_size)
     @argcheck switch_below >= 0
 
     # Degenerate cases
     len = length(src)
     len == 0 && return init
-    len == 1 && return @allowscalar f(src[1])
+    len == 1 && return op(init, @allowscalar f(src[1]))
     if len < switch_below
         h_src = Vector(src)
         return Base.mapreduce(f, op, h_src; init)
@@ -87,8 +88,8 @@ function mapreduce_1d_gpu(
         dst = KernelAbstractions.allocate(backend, dst_type, blocks * 2)
     end
 
-    # Later the kernel will be compiled for views anyways, so use same types
-    src_view = @view src[1:end]
+    # Later the kernel will be compiled for views anyways, so use same types for arrays.
+    src_view = _mapreduce_1d_src_view(src)
     dst_view = @view dst[1:blocks]
 
     kernel! = _mapreduce_block!(backend, block_size)
@@ -125,3 +126,6 @@ function mapreduce_1d_gpu(
     # The GPU kernel reduced all elements to one, but without the init value
     return op(init, @allowscalar(p1[1]))
 end
+
+_mapreduce_1d_src_view(src::AbstractArray) = @view src[1:end]
+_mapreduce_1d_src_view(src::Base.Broadcast.Broadcasted) = src
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index 231d0dc4..690c2920 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -1,10 +1,85 @@
+# Generalized N-dimensional mapreduce for GPU and CPU backends, reducing one or more
+# dimensions (`dims::Int` or `dims::Tuple`) of `src` into `dst`.
+#
+# Design (see references: CUDA.jl / GPUArrays mapreducedim!, PyTorch Reduce.cuh, CUB):
+#   1. Canonicalize dims: collapse adjacent dimensions with matching strides into contiguous
+#      segments. A plain `dims=2` reduction, or any reduction over a contiguous block of dims
+#      (e.g. `dims=(1,2)`), collapses to a *single* reduce segment.
+#   2. The inner reduce loop must avoid per-element integer division. For a single segment the
+#      element offset is just `j * stride`; only genuinely non-contiguous dim sets (e.g.
+#      `dims=(1,3)`) fall back to a per-element multi-dimensional decode.
+#   3. Four work decompositions, chosen by the relative sizes and strides of the output and
+#      the reduction:
+#        - by_thread:     one thread per output (many outputs, small reduction)
+#        - tiled_strided: several contiguous outputs per block, for square-like strided reductions
+#        - by_block:      one block per output, grid-stride over outputs (few outputs, large reduction)
+#        - multigroup:    several blocks per output, two-pass (dst_size==1 or very small dst_size)
+
+# Number of blocks the by_block / multigroup paths aim to launch, so a reduction with
+# few output elements can still fill the GPU. A heuristic GPU-occupancy target.
+const TARGET_BLOCKS = 256
+
+# Below this many output elements, splitting a single output's reduction across multiple
+# blocks (multigroup) is preferred over grid-striding by_block, because grid-stride with
+# too few blocks cannot fill the GPU on its own. At or above this, by_block grid-strides.
+const GS_DST_CUTOFF = 32
+
+# Minimum reduction-loop iterations per thread in the multigroup first pass.
+# Caps reduce_groups so splitting a reduction doesn't shrink per-thread work
+# below the point where launch/scheduling overhead dominates actual work.
+const MIN_ITEMS_PER_THREAD = 8
+
+# Number of contiguous output rows handled by each tiled-strided block. With the default
+# 256-thread block this gives 32 lanes per row.
+const TILED_STRIDED_ROWS_PER_BLOCK = 8
+
+
+# Host-side canonicalization: split the dimensions into reduced and kept ("outer") segments,
+# merging adjacent dimensions whose strides are contiguous. Returns two tuples of
+# (stride, size) segments.
+function _canonicalize_dims(src_sizes, src_strides, dims_valid)
+    ndim = length(src_sizes)
+    reduce_segs = Tuple{Int,Int}[]
+    outer_segs  = Tuple{Int,Int}[]
+
+    i = 1
+    while i <= ndim
+        if i in dims_valid
+            seg_stride = src_strides[i]
+            seg_size   = src_sizes[i]
+            while i + 1 <= ndim &&
+                  (i + 1) in dims_valid &&
+                  src_strides[i + 1] == seg_stride * seg_size
+                i += 1
+                seg_size *= src_sizes[i]
+            end
+            push!(reduce_segs, (seg_stride, seg_size))
+        else
+            seg_stride = src_strides[i]
+            seg_size   = src_sizes[i]
+            while i + 1 <= ndim &&
+                  !((i + 1) in dims_valid) &&
+                  src_strides[i + 1] == seg_stride * seg_size
+                i += 1
+                seg_size *= src_sizes[i]
+            end
+            push!(outer_segs, (seg_stride, seg_size))
+        end
+        i += 1
+    end
+
+    return Tuple(reduce_segs), Tuple(outer_segs)
+end
+
+# Main entry point
+
 function mapreduce_nd(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
-    neutral=neutral_element(op, eltype(src)),
-    dims::Int,
+    neutral=neutral_element(op, typeof(init)),
+    dims::Union{Int, Tuple{Vararg{Int}}},
 
-    # CPU settings - ignored here
+    # CPU settings
     max_tasks::Int,
     min_elems::Int,
     prefer_threads::Bool=true,
@@ -14,327 +89,637 @@ function mapreduce_nd(
     temp::Union{Nothing, AbstractArray},
 )
     @argcheck 1 <= block_size <= 1024
+    @argcheck ispow2(block_size)
 
-    # Degenerate cases begin; order of priority matters
+    dims_all = dims isa Int ? (dims,) : dims
 
-    # Invalid dims
-    if dims < 1
+    if Base.any(d < 1 for d in dims_all)
         throw(ArgumentError("region dimension(s) must be ≥ 1, got $dims"))
     end
 
-    # If dims > number of dimensions, just map each element through f and add init, e.g.:
-    #   julia> x = rand(Float64, 3, 5);
-    #   julia> mapreduce(x -> -x, +, x, dims=3, init=Float32(0))
-    #   3×5 Matrix{Float32}     # Negative numbers
+    # Match Base: duplicate dims are ignored, e.g. dims=(2,2) behaves like dims=2.
+    dims_all = Tuple(Base.unique(dims_all))
+
     src_sizes = size(src)
-    if dims > length(src_sizes)
-        if isnothing(temp)
-            dst = KernelAbstractions.allocate(backend, typeof(init), src_sizes)
-        else
-            @argcheck get_backend(temp) == backend
-            @argcheck size(temp) == src_sizes
-            @argcheck eltype(temp) == typeof(init)
-            dst = temp
-        end
+    ndim      = length(src_sizes)
+
+    dims_valid = Tuple(d for d in dims_all if d <= ndim)
+
+    # Degenerate cases begin; order of priority matters
+
+    # All reduced dims are beyond ndims: just map each element through f and add init, e.g.:
+    #   julia> x = rand(Float64, 3, 5);
+    #   julia> mapreduce(x -> -x, +, x, dims=3, init=Float32(0))     # 3×5 Matrix{Float32}
+    if isempty(dims_valid)
+        dst = _alloc_or_temp(backend, temp, init, src_sizes)
         _mapreduce_nd_apply_init!(f, op, dst, src, backend; init, max_tasks, min_elems, block_size)
         return dst
     end
 
     # The per-dimension sizes of the destination array; construct tuple without allocations
     dst_sizes = unrolled_map_index(src_sizes) do i
-        i == dims ? 1 : src_sizes[i]
+        i in dims_valid ? 1 : src_sizes[i]
     end
 
-    # If any dimension except dims is zero, return empty similar array except with the dims
-    # dimension = 1. Weird, see example below:
-    #   julia> x = rand(3, 0, 5);
-    #   julia> reduce(+, x, dims=3)
-    #   3×0×1 Array{Float64, 3}
+    # If any kept dimension is zero, return empty array (reduced dims become 1), e.g.:
+    #   julia> x = rand(3, 0, 5);  reduce(+, x, dims=3)     # 3×0×1 Array{Float64, 3}
     for isize in eachindex(src_sizes)
-        isize == dims && continue
+        isize in dims_valid && continue
         if src_sizes[isize] == 0
-            if isnothing(temp)
-                dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes)
-            else
-                @argcheck size(temp) == dst_sizes
-                @argcheck eltype(temp) == typeof(init)
-                dst = temp
-            end
-            return dst
+            return _alloc_or_temp(backend, temp, init, dst_sizes)
         end
     end
 
-    # If sizes[dims] == 0, return array filled with init; same shape except sizes[dims] = 1:
-    #   julia> x = rand(3, 0, 5);
-    #   julia> mapreduce(+, x, dims=2)
-    #   3×1×5 Array{Float64, 3}:
-    #   [:, :, 1] =
-    #    0.0
-    #    0.0
-    #    0.0
-    #   [...]
-    len = src_sizes[dims]
+    len = Base.prod(src_sizes[d] for d in dims_valid)
+
+    # If a reduced dimension is zero, return array filled with init, e.g.:
+    #   julia> x = rand(3, 0, 5);  mapreduce(+, x, dims=2)     # 3×1×5 of zeros
     if len == 0
-        if isnothing(temp)
-            dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes)
-        else
-            @argcheck get_backend(temp) == backend
-            @argcheck size(temp) == dst_sizes
-            @argcheck eltype(temp) == typeof(init)
-            dst = temp
-        end
+        dst = _alloc_or_temp(backend, temp, init, dst_sizes)
         fill!(dst, init)
         return dst
     end
 
-    # If sizes[dims] == 1, just map each element through f. Again, keep same type as init
+    # If the reduced extent is 1, just map each element through f (keep init's type)
     if len == 1
-        if isnothing(temp)
-            dst = KernelAbstractions.allocate(backend, typeof(init), src_sizes)
-        else
-            @argcheck get_backend(temp) == backend
-            @argcheck size(temp) == src_sizes
-            @argcheck eltype(temp) == typeof(init)
-            dst = temp
-        end
+        dst = _alloc_or_temp(backend, temp, init, src_sizes)
         _mapreduce_nd_apply_init!(f, op, dst, src, backend; init, max_tasks, min_elems, block_size)
         return dst
     end
 
     # Degenerate cases end
 
-    # Allocate destination array
-    if isnothing(temp)
-        dst = KernelAbstractions.allocate(backend, typeof(init), dst_sizes)
-    else
-        @argcheck get_backend(temp) == backend
-        @argcheck size(temp) == dst_sizes
-        @argcheck eltype(temp) == typeof(init)
-        dst = temp
-    end
+    dst = _alloc_or_temp(backend, temp, init, dst_sizes)
     dst_size = length(dst)
 
-    if !use_gpu_algorithm(backend, prefer_threads)
-        _mapreduce_nd_cpu_sections!(
-            f, op, dst, src;
-            init,
-            dims,
-            max_tasks=max_tasks,
-            min_elems=min_elems,
+    if backend == CPU_BACKEND
+        _mapreduce_nd_cpu_sections!(f, op, dst, src; init, max_tasks, min_elems)
+        return dst
+    end
+
+    # The stride-based fast paths below index a flat buffer at
+    # `buffer[base_offset + Σ coordᵈ·strideᵈ + 1]`. This works for any source backed
+    # by a single dense column-major buffer (dense arrays, but also strided views,
+    # adjoints, permuted dims, and reshapes over one). Sources without such a buffer
+    # — `Broadcasted`, lazy/computed arrays — take the generic Cartesian-indexed
+    # fallback, which makes no layout assumption (and crucially does not wrap the
+    # source in `@Const`, which is what makes e.g. PermutedDimsArray uncompilable).
+    layout = _mapreduce_strided_layout(src)
+    if isnothing(layout)
+        blocks = cld(dst_size, block_size)
+        kernel! = _mapreduce_nd_generic!(backend, block_size)
+        kernel!(
+            src, dst, f, op, init,
+            CartesianIndices(dst), _mapreduce_reduce_indices(src, dst), dst_size,
+            ndrange=(block_size * blocks,),
         )
-    else
-        # On GPUs we have two parallelisation approaches, based on which dimension has more elements:
-        #   - If the dimension we are reducing has more elements, (e.g. reduce(+, rand(3, 1000), dims=2)),
-        #     we use a block of threads per dst element - thus, a block of threads reduces the dims axis
-        #   - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we
-        #     use a single thread per dst element - thus, a thread reduces the dims axis sequentially,
-        #     while the other dimensions are processed in parallel, independently
-        if dst_size >= src_sizes[dims]
-            blocks = (dst_size + block_size - 1) ÷ block_size
-            kernel1! = _mapreduce_nd_by_thread!(backend, block_size)
-            kernel1!(
-                src, dst, f, op, init, dims,
-                ndrange=(block_size * blocks,),
+        return dst
+    end
+
+    buffer, base_offset, src_strides = layout
+    reduce_segs, outer_segs = _canonicalize_dims(src_sizes, src_strides, dims_valid)
+
+    outer_strides  = Tuple(str for (str, _) in outer_segs)
+    outer_sizes    = Tuple(s   for (_, s)   in outer_segs)
+    reduce_strides = Tuple(str for (str, _) in reduce_segs)
+    reduce_sizes   = Tuple(s   for (_, s)   in reduce_segs)
+    reduce_size    = len
+
+    # ─────────────────────────────────────────────────────────────────────────
+    # Dispatch decision (see header comment for the four paths):
+    #
+    #   - square-like, contiguous output with strided input -> tiled_strided
+    #   - dst_size >= reduce_size                          -> by_thread
+    #   - dst_size == 1, or dst_size < GS_DST_CUTOFF
+    #     (and dst_size < reduce_size)                     -> multigroup (split one
+    #                                                          reduction across blocks,
+    #                                                          needs a 2nd-pass combine)
+    #   - otherwise (GS_DST_CUTOFF <= dst_size < reduce_size) -> by_block, grid-striding
+    #                                                            over outputs, single pass
+    #
+    # Rationale: grid-striding by_block launches `min(dst_size, TARGET_BLOCKS)` blocks;
+    # for very small dst_size (e.g. 5 or 9) that under-fills an 84-SM GPU, so splitting
+    # the (large) reduction itself across many blocks via multigroup is still better.
+    # For dst_size==1 there is nothing to grid-stride over, so multigroup is the only
+    # option regardless of GS_DST_CUTOFF.
+    # ─────────────────────────────────────────────────────────────────────────
+
+    # by_block override 1: when the reduced dimension is the fastest-varying one
+    # (reduce_strides==(1,)), by_thread's per-thread strided access is badly
+    # uncoalesced, while by_block lets consecutive threads read consecutive elements.
+    use_by_block_for_coalescing =
+        dst_size >= reduce_size && reduce_size >= block_size &&
+        length(reduce_sizes) == 1 && reduce_sizes[1] != 0 &&
+        reduce_strides == (1,)
+
+    # by_block override 2: when by_thread would launch too few blocks for a square-like
+    # output/reduction shape, split each output reduction across a full block. This
+    # fallback is mostly for layouts that do not satisfy the narrower tiled-strided
+    # pattern below; the common contiguous-output strided case uses tiled_strided.
+    # Avoid applying this to wide-output shapes, where by_thread's cross-output
+    # coalescing is better than a strided block reduction.
+    use_by_block_for_low_occupancy =
+        dst_size == reduce_size && reduce_size >= block_size &&
+        cld(dst_size, block_size) < TARGET_BLOCKS &&
+        length(reduce_sizes) == 1 && reduce_sizes[1] != 0
+
+    use_by_block = use_by_block_for_coalescing || use_by_block_for_low_occupancy
+    use_tiled_strided =
+        dst_size == reduce_size && reduce_size >= block_size &&
+        length(outer_sizes) == 1 && outer_strides == (1,) &&
+        length(reduce_sizes) == 1 && reduce_strides[1] > 1 &&
+        block_size % TILED_STRIDED_ROWS_PER_BLOCK == 0 &&
+        ispow2(block_size ÷ TILED_STRIDED_ROWS_PER_BLOCK)
+
+    if use_tiled_strided
+        # Narrow layout-specific path: square-like row reductions with contiguous
+        # outputs and strided input reads, e.g. size=(1024,1024), dims=2. Several
+        # outputs share a block: small lane groups reduce one output each, preserving
+        # more cross-output memory coalescing than by_block while launching many more
+        # blocks than by_thread. This is the only extra kernel beyond the generic
+        # by_thread/by_block/multigroup shapes.
+        rows_per_block = TILED_STRIDED_ROWS_PER_BLOCK
+        blocks = cld(dst_size, rows_per_block)
+        kernel! = _mapreduce_nd_by_thread_tiled_strided!(backend, block_size)
+        kernel!(
+            buffer, dst, f, op, init, neutral,
+            base_offset, reduce_strides[1], dst_size, reduce_size, Val(rows_per_block),
+            ndrange=(block_size * blocks,),
+        )
+    elseif dst_size >= reduce_size && !use_by_block
+        # Many outputs, small reduction: one thread per output reduces sequentially
+        blocks = cld(dst_size, block_size)
+        kernel! = _mapreduce_nd_by_thread!(backend, block_size)
+        kernel!(
+            buffer, dst, f, op, init,
+            base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes,
+            dst_size, reduce_size,
+            ndrange=(block_size * blocks,),
+        )
+    elseif use_by_block
+        # Grid-stride by_block: one full block cooperates on each output, then
+        # grid-strides across remaining outputs if fewer blocks than outputs were
+        # launched.
+        launch_blocks = min(dst_size, TARGET_BLOCKS)
+        kernel! = _mapreduce_nd_by_block!(backend, block_size)
+        kernel!(
+            buffer, dst, f, op, init, neutral,
+            base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes,
+            dst_size, reduce_size, launch_blocks,
+            ndrange=(block_size * launch_blocks,),
+        )
+    elseif dst_size == 1 || dst_size < GS_DST_CUTOFF
+        # Very few outputs, large reduction: split each output's reduction across
+        # `reduce_groups` blocks (multigroup), combine partials in a second pass.
+        reduce_groups = min(
+            cld(reduce_size, block_size),
+            block_size,
+            cld(TARGET_BLOCKS, dst_size),
+            cld(reduce_size, block_size * MIN_ITEMS_PER_THREAD),
+        )
+        reduce_groups = max(reduce_groups, 1)
+
+        if reduce_groups > 1
+            partial = KernelAbstractions.allocate(backend, typeof(init), (dst_size, reduce_groups))
+
+            kernel! = _mapreduce_nd_multigroup!(backend, block_size)
+            kernel!(
+                buffer, partial, f, op, neutral,
+                base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes,
+                Val(dst_size), reduce_size, reduce_groups,
+                ndrange=(block_size * dst_size * reduce_groups,),
             )
-        else
-            # One block per output element
-            blocks = dst_size
-            kernel2! = _mapreduce_nd_by_block!(backend, block_size)
+
+            # Second pass: reduce partial (dst_size × reduce_groups) → dst, one block
+            # per output. reduce_groups is small (<=block_size by construction), so use
+            # a small block size for this pass to avoid an oversubscribed tree-reduce.
+            pass2_block_size = _pass2_block_size(reduce_groups)
+            kernel2! = _mapreduce_partial_to_dst!(backend, pass2_block_size)
             kernel2!(
-                src, dst, f, op, init, neutral, dims,
-                ndrange=(block_size * blocks,),
+                partial, dst, op, init, neutral,
+                dst_size, reduce_groups,
+                ndrange=(pass2_block_size * dst_size,),
+            )
+        else
+            # reduce_groups collapsed to 1 (e.g. reduce_size <= block_size): just do a
+            # single-block-per-output reduction directly, no partial array needed.
+            kernel! = _mapreduce_nd_by_block!(backend, block_size)
+            kernel!(
+                buffer, dst, f, op, init, neutral,
+                base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes,
+                dst_size, reduce_size, dst_size,
+                ndrange=(block_size * dst_size,),
             )
         end
+    else
+        # GS_DST_CUTOFF <= dst_size < reduce_size: grid-stride over outputs, one pass.
+        # Cap launched blocks at TARGET_BLOCKS; each block handles
+        # ceil(dst_size / launch_blocks) outputs sequentially.
+        launch_blocks = min(dst_size, TARGET_BLOCKS)
+        kernel! = _mapreduce_nd_by_block!(backend, block_size)
+        kernel!(
+            buffer, dst, f, op, init, neutral,
+            base_offset, outer_strides, outer_sizes, reduce_strides, reduce_sizes,
+            dst_size, reduce_size, launch_blocks,
+            ndrange=(block_size * launch_blocks,),
+        )
     end
 
     return dst
 end
 
+function _mapreduce_dense_strides(sizes::Tuple)
+    stride = 1
+    return ntuple(length(sizes)) do i
+        s = stride
+        stride *= sizes[i]
+        s
+    end
+end
+
+# Resolve the layout the stride-based fast-path kernels need. Those kernels index a
+# flat buffer at `buffer[base_offset + Σ coordᵈ·strideᵈ + 1]`, so they work for any
+# source backed by a single dense column-major buffer — a dense array, but also a
+# strided view, adjoint, permuted-dims, or reshape over one. Returns
+# `(buffer, base_offset, strides)` for such sources, or `nothing` (→ generic
+# Cartesian-indexed fallback) for `Broadcasted` and anything not backed by a dense
+# buffer (lazy/computed arrays, complex adjoints without `strides`, nested wrappers).
+function _mapreduce_strided_layout(src::AbstractArray)
+    s = try
+        strides(src)
+    catch err
+        err isa MethodError || rethrow()
+        return nothing
+    end
+
+    # Dense or contiguous (column-major) — index the source directly, no offset.
+    s == _mapreduce_dense_strides(size(src)) && return (src, 0, s)
+
+    # Non-dense but strided: index the dense parent buffer at the source's offset.
+    # Only one level of wrapping over a dense buffer is supported; deeper nesting
+    # falls back to the generic kernel.
+    p = parent(src)
+    (p === src || !_mapreduce_is_dense_buffer(p)) && return nothing
+    base = _mapreduce_wrapper_offset(src, p)
+    base === nothing && return nothing
+    return (p, base, s)
+end
+
+_mapreduce_strided_layout(::Base.Broadcast.Broadcasted) = nothing
+
+function _mapreduce_is_dense_buffer(p::AbstractArray)
+    try
+        return strides(p) == _mapreduce_dense_strides(size(p))
+    catch err
+        err isa MethodError || rethrow()
+        return false
+    end
+end
+
+# Offset (0-based, in elements) of the wrapper's first element within its dense
+# parent buffer. SubArrays start at their first selected index; permuted-dims,
+# reshapes, and (real) adjoints/transposes share the parent's first element.
+function _mapreduce_wrapper_offset(src::SubArray, p)
+    try
+        # Base.map (not AK's array `map`, which shadows it inside this module).
+        first_index = Base.map(first, parentindices(src))
+        return LinearIndices(p)[first_index...] - 1
+    catch
+        return nothing   # exotic index types → fall back to the generic kernel
+    end
+end
+_mapreduce_wrapper_offset(src, p) = 0
+
+# The reduced-extent index space: full axes along reduced dimensions, a single
+# index (`OneTo(1)`) along kept dimensions. Combined with `max(Iother, Ireduce)`
+# this reproduces Base's `mapreducedim!` iteration without touching strides.
+_mapreduce_reduce_indices(src, dst) =
+    CartesianIndices(ifelse.(axes(src) .== axes(dst), Ref(Base.OneTo(1)), axes(src)))
+
+
+# Smallest power-of-two >= n, capped at 256 (the original fixed block size) and
+# floored at 32 (one warp) — used for the multigroup second pass, which only ever
+# combines `reduce_groups` values (reduce_groups <= block_size by construction).
+@inline function _pass2_block_size(n::Int)
+    n <= 32  && return 32
+    n <= 64  && return 64
+    n <= 128 && return 128
+    return 256
+end
+
+
+# Allocate a destination array, or validate and reuse a user-provided `temp`.
+function _alloc_or_temp(backend, temp, init, sizes)
+    isnothing(temp) && return KernelAbstractions.allocate(backend, typeof(init), sizes)
+    @argcheck get_backend(temp) == backend
+    @argcheck size(temp) == sizes
+    @argcheck eltype(temp) == typeof(init)
+    temp
+end
+
+# CPU path
 
 function _mapreduce_nd_cpu_sections!(
     f, op, dst, src;
-    init,
-    dims,
-    max_tasks, min_elems,
+    init, max_tasks, min_elems,
 )
-    src_sizes = size(src)
-    src_strides = strides(src)
-    dst_strides = strides(dst)
-
-    reduce_size = src_sizes[dims]
-    ndims = length(src_sizes)
+    Rother  = CartesianIndices(dst)
+    Rreduce = _mapreduce_reduce_indices(src, dst)
 
-    # Each thread handles a section of the output array - i.e. reducing along the dims, for
-    # multiple output strides
     foreachindex(dst, max_tasks=max_tasks, min_elems=min_elems) do idst
-
         @inbounds begin
-            # Compute the base index in src (excluding the reduced axis)
-            input_base_idx = 0
-            tmp = idst - 1
-            KernelAbstractions.Extras.@unroll for i in ndims:-1:1
-                if i != dims
-                    input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i]
-                end
-                tmp = tmp % dst_strides[i]
-            end
-
-            # Go over each element in the reduced dimension
+            Iother = Rother[idst]
             res = init
-            for i in 0:reduce_size - 1
-                src_idx = input_base_idx + i * src_strides[dims]
-                res = op(res, f(src[src_idx + 1]))
+            for Ireduce in Rreduce
+                J = max(Iother, Ireduce)
+                res = op(res, f(src[J]))
             end
             dst[idst] = res
         end
     end
-
     dst
 end
 
+# Index helpers. Both decode a linear index into a byte-offset by walking compile-time-sized
+# segment tuples. The single-segment case (the common one after canonicalization) needs no
+# division; multi-segment falls back to a per-element decode (rare: non-adjacent dim sets).
+
+@inline function _outer_decode(tid, outer_strides, outer_sizes)
+    isempty(outer_sizes) && return 0
+    length(outer_sizes) == 1 && return tid * outer_strides[1]
+    base = 0
+    tmp  = tid
+    @inbounds for i in 1:length(outer_sizes)
+        q     = tmp ÷ outer_sizes[i]
+        r     = tmp - q * outer_sizes[i]
+        base += r * outer_strides[i]
+        tmp   = q
+    end
+    base
+end
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!(
-    @Const(src), dst,
-    f, op,
-    init, dims,
+@inline function _reduce_offset(j, reduce_strides, reduce_sizes)
+    isempty(reduce_sizes) && return 0
+    # Single segment: j < reduce_size, so the decode is just j * stride — no division.
+    length(reduce_sizes) == 1 && return j * reduce_strides[1]
+    off = 0
+    tmp = j
+    @inbounds for i in 1:length(reduce_sizes) - 1
+        sz = reduce_sizes[i]
+        if ispow2(sz)
+            shift = trailing_zeros(sz)
+            r   = tmp & (sz - 1)
+            tmp = tmp >> shift
+        else
+            q   = tmp ÷ sz
+            r   = tmp - q * sz
+            tmp = q
+        end
+        off += r * reduce_strides[i]
+    end
+    off += tmp * reduce_strides[end]
+    off
+end
+
+# GPU kernel: generic fallback — one thread per output element, reducing sequentially
+# over the reduced extents using Cartesian indexing. Makes no assumption about the
+# source's memory layout, so it handles strided views, adjoints, permuted dims, and
+# broadcasts over them. Mirrors the CPU `_mapreduce_nd_cpu_sections!` path.
+#
+# NOTE: the source is deliberately NOT marked `@Const` here. `@Const` prevents the
+# `@inbounds` getindex of some wrappers (e.g. PermutedDimsArray's `genperm`) from
+# being elided, leaving a `throw` that the GPU backends cannot compile.
+@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_generic!(
+    src, dst,
+    f, op, init,
+    Rother, Rreduce, output_size,
 )
-    # One thread per output element, when there are more outer elements than in the reduced dim
-    # e.g. reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim
-    src_sizes = size(src)
-    src_strides = strides(src)
-    dst_sizes = size(dst)
-    dst_strides = strides(dst)
+    N       = @groupsize()[1]
+    iblock  = @index(Group, Linear) - 0x1
+    ithread = @index(Local, Linear) - 0x1
+    tid     = ithread + iblock * N
 
-    output_size = length(dst)
-    reduce_size = src_sizes[dims]
+    if tid < output_size
+        Iother = Rother[tid + 0x1]
+        res = init
+        for Ireduce in Rreduce
+            J = max(Iother, Ireduce)
+            res = op(res, f(src[J]))
+        end
+        dst[Iother] = res
+    end
+end
 
-    ndims = length(src_sizes)
+@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread_tiled_strided!(
+    @Const(src), dst,
+    f, op, init, neutral,
+    base_offset, reduce_stride,
+    output_size, reduce_size,
+    ::Val{rows},
+) where {rows}
+    @uniform N = @groupsize()[1]
+    @uniform reduce_threads = N ÷ rows
+    sdata = @localmem eltype(dst) (N,)
+
+    iblock  = @index(Group, Linear) - 0x1
+    ithread = @index(Local, Linear) - 0x1
 
-    N = @groupsize()[1]
+    # Int(...) keeps the index decode signed; a bare `unsigned ÷ unsigned` result feeds
+    # the signed index math and adds an Int-conversion throw guard NVPTX keeps (CUDA ~20%).
+    row  = Int(unsigned(ithread) % unsigned(rows))
+    lane = Int(unsigned(ithread) ÷ unsigned(rows))
+    iout = iblock * rows + row
+
+    acc = neutral
+    if iout < output_size
+        # Contiguous outputs (outer_strides == (1,)), so the source base is just iout;
+        # base_offset (0 for a dense source) is folded in once, outside the reduce loop.
+        sbase = base_offset + iout
+        j = lane
+        while j < reduce_size
+            acc = op(acc, f(src[sbase + j * reduce_stride + 0x1]))
+            j += reduce_threads
+        end
+    end
 
-    # NOTE: for many index calculations in this library, computation using zero-indexing leads to
-    # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
-    # indexing). Internal calculations will be done using zero indexing except when actually
-    # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
+    sdata[ithread + 0x1] = acc
+    @synchronize()
+
+    step = reduce_threads ÷ 2
+    while step >= 1
+        if lane < step
+            dst_lane = row + lane * rows
+            src_lane = row + (lane + step) * rows
+            sdata[dst_lane + 0x1] = op(sdata[dst_lane + 0x1], sdata[src_lane + 0x1])
+        end
+        @synchronize()
+        step ÷= 2
+    end
 
-    # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
+    if lane == 0x0 && iout < output_size
+        dst[iout + 0x1] = op(init, sdata[row + 0x1])
+    end
+end
+
+# GPU kernel: by_thread — one thread per output element, reducing sequentially.
+# Used when there are more output elements than elements in the reduced dimension(s),
+# e.g. reduce(+, rand(3, 1000), dims=1) — only 3 elements to reduce per output.
+
+@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!(
+    @Const(src), dst,
+    f, op, init,
+    base_offset,
+    outer_strides, outer_sizes,
+    reduce_strides, reduce_sizes,
+    output_size, reduce_size,
+)
+    # NOTE: index calculations use zero-indexing (fewer ops, matches the CUDA / ROCm / oneAPI /
+    # Metal code this is transpiled to), converting to one-indexing only at memory accesses.
+    # `base_offset` (the source's element offset within its dense buffer; 0 for a dense
+    # source) is folded into the per-output base, so it costs at most one add per thread.
+    N       = @groupsize()[1]
+    iblock  = @index(Group, Linear) - 0x1
     ithread = @index(Local, Linear) - 0x1
+    tid     = ithread + iblock * N
 
-    # Each thread handles one output element
-    tid = ithread + iblock * N
     if tid < output_size
+        input_base = base_offset + _outer_decode(tid, outer_strides, outer_sizes)
 
-        # # Sometimes slightly faster method using additional memory with
-        # # output_idx = @private typeof(iblock) (ndims,)
-        # tmp = tid
-        # KernelAbstractions.Extras.@unroll for i in ndims:-1:1
-        #     output_idx[i] = tmp ÷ dst_strides[i]
-        #     tmp = tmp % dst_strides[i]
-        # end
-        # # Compute the base index in src (excluding the reduced axis)
-        # input_base_idx = 0
-        # KernelAbstractions.Extras.@unroll for i in 1:ndims
-        #     i == dims && continue
-        #     input_base_idx += output_idx[i] * src_strides[i]
-        # end
-
-        # Compute the base index in src (excluding the reduced axis)
-        input_base_idx = typeof(ithread)(0)
-        tmp = tid
-        KernelAbstractions.Extras.@unroll for i in ndims:-1i16:1i16
-            if i != dims
-                input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i]
-            end
-            tmp = tmp % dst_strides[i]
-        end
-
-        # Go over each element in the reduced dimension; this implementation assumes that there
-        # are so many outer elements (each processed by an independent thread) that we afford to
-        # loop sequentially over the reduced dimension (e.g. reduce(+, rand(3, 1000), dims=1))
         res = init
-        for i in 0x0:reduce_size - 0x1
-            src_idx = input_base_idx + i * src_strides[dims]
-            res = op(res, f(src[src_idx + 0x1]))
+        for j in 0x0:reduce_size - 0x1
+            off = _reduce_offset(j, reduce_strides, reduce_sizes)
+            res = op(res, f(src[input_base + off + 0x1]))
         end
+
         dst[tid + 0x1] = res
     end
 end
 
+# GPU kernel: by_block — each block reduces one output element, then grid-strides to
+# the next output (iout += num_blocks) until all outputs are covered. When
+# num_blocks >= output_size, each block handles at most one output (the original,
+# non-striding behavior) at zero extra cost — the while loop runs once.
+#
+# Used for GS_DST_CUTOFF <= dst_size < reduce_size (grid-stride, num_blocks ==
+# min(dst_size, TARGET_BLOCKS) < dst_size in general), and also for the
+# reduce_groups==1 fallback inside the multigroup branch (num_blocks == dst_size,
+# so the loop runs exactly once per block — identical to the pre-grid-stride kernel).
 
 @kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_block!(
     @Const(src), dst,
-    f, op,
-    init, neutral,
-    dims,
+    f, op, init, neutral,
+    base_offset,
+    outer_strides, outer_sizes,
+    reduce_strides, reduce_sizes,
+    output_size, reduce_size, num_blocks,
 )
-    # One block per output element, when there are more elements in the reduced dim than in outer
-    # e.g. reduce(+, rand(3, 1000), dims=2) => only 3 elements in outer dimensions
-    src_sizes = size(src)
-    src_strides = strides(src)
-    dst_sizes = size(dst)
-    dst_strides = strides(dst)
-
-    output_size = length(dst)
-    reduce_size = src_sizes[dims]
-
-    ndims = length(src_sizes)
-
     @uniform N = @groupsize()[1]
     sdata = @localmem eltype(dst) (N,)
 
-    # NOTE: for many index calculations in this library, computation using zero-indexing leads to
-    # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
-    # indexing). Internal calculations will be done using zero indexing except when actually
-    # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
-
-    # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
+    iblock  = @index(Group, Linear) - 0x1
     ithread = @index(Local, Linear) - 0x1
+    iout = iblock
+    while true
+        input_base = base_offset + _outer_decode(iout, outer_strides, outer_sizes)
+
+        acc = neutral
+        j   = ithread
+        while j < reduce_size
+            off = _reduce_offset(j, reduce_strides, reduce_sizes)
+            acc = op(acc, f(src[input_base + off + 0x1]))
+            j  += N
+        end
 
-    # Each block handles one output element - thus, iblock ∈ [0, output_size)
-
-    # # Sometimes slightly faster method using additional memory with
-    # # output_idx = @private typeof(iblock) (ndims,)
-    # tmp = iblock
-    # KernelAbstractions.Extras.@unroll for i in ndims:-1:1
-    #     output_idx[i] = tmp ÷ dst_strides[i]
-    #     tmp = tmp % dst_strides[i]
-    # end
-    # # Compute the base index in src (excluding the reduced axis)
-    # input_base_idx = 0
-    # KernelAbstractions.Extras.@unroll for i in 1:ndims
-    #     i == dims && continue
-    #     input_base_idx += output_idx[i] * src_strides[i]
-    # end
-
-    # Compute the base index in src (excluding the reduced axis)
-    input_base_idx = typeof(ithread)(0)
-    tmp = iblock
-    KernelAbstractions.Extras.@unroll for i in ndims:-1i16:1i16
-        if i != dims
-            input_base_idx += (tmp ÷ dst_strides[i]) * src_strides[i]
+        sdata[ithread + 0x1] = acc
+        @synchronize()
+
+        @inline reduce_group!(@context, op, sdata, N, ithread)
+
+        if ithread == 0x0
+            dst[iout + 0x1] = op(init, sdata[0x1])
         end
-        tmp = tmp % dst_strides[i]
+
+        next_iout = iout + num_blocks
+        next_iout >= output_size && break
+
+        @synchronize()
+        iout = next_iout
     end
+end
+
+# GPU kernel: multi-group first pass — several blocks per output element. Block `iblock`
+# handles output `iout` and group `igroup`, reducing its interleaved slice of the reduction
+# into partial[iout, igroup]. The second pass combines the groups.
+
+@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_multigroup!(
+    @Const(src), partial,
+    f, op, neutral,
+    base_offset,
+    outer_strides, outer_sizes,
+    reduce_strides, reduce_sizes,
+    ::Val{output_size}, reduce_size, reduce_groups,
+) where {output_size}
+    @uniform N = @groupsize()[1]
+    sdata = @localmem eltype(partial) (N,)
+
+    iblock  = @index(Group, Linear) - 0x1
+    ithread = @index(Local, Linear) - 0x1
 
-    # We have a block of threads to process the whole reduced dimension. First do pre-reduction
-    # in strides of N
-    partial = neutral
-    i = ithread
-    while i < reduce_size
-        src_idx = input_base_idx + i * src_strides[dims]
-        partial = op(partial, f(src[src_idx + 0x1]))
-        i += N
+    # Int(...) keeps the index decode signed (see tiled kernel): avoids a CUDA throw guard.
+    iout   = Int(unsigned(iblock) % unsigned(output_size))
+    igroup = Int(unsigned(iblock) ÷ unsigned(output_size))
+
+    input_base = base_offset + _outer_decode(iout, outer_strides, outer_sizes)
+
+    acc = neutral
+    j   = ithread + igroup * N
+    while j < reduce_size
+        off = _reduce_offset(j, reduce_strides, reduce_sizes)
+        acc = op(acc, f(src[input_base + off + 0x1]))
+        j  += N * reduce_groups
     end
 
-    # Store partial result in shared memory; now we are down to a single block to reduce within
-    sdata[ithread + 0x1] = partial
+    sdata[ithread + 0x1] = acc
     @synchronize()
 
     @inline reduce_group!(@context, op, sdata, N, ithread)
 
     if ithread == 0x0
-        dst[iblock + 0x1] = op(init, sdata[0x1])
+        partial[iout + igroup * output_size + 0x1] = sdata[0x1]
+    end
+end
+
+# GPU kernel: multi-group second pass — one block per output reduces over the `reduce_groups`
+# partials and folds in `init`. Launched with a block size sized to `reduce_groups`
+# (see _pass2_block_size), avoiding an oversubscribed tree-reduce when reduce_groups
+# is much smaller than 256.
+
+@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_partial_to_dst!(
+    @Const(partial), dst,
+    op, init, neutral,
+    output_size, reduce_groups,
+)
+    @uniform N = @groupsize()[1]
+    sdata = @localmem eltype(dst) (N,)
+
+    iblock  = @index(Group, Linear) - 0x1
+    ithread = @index(Local, Linear) - 0x1
+
+    if iblock < output_size
+        acc = neutral
+        g = ithread
+        while g < reduce_groups
+            acc = op(acc, partial[iblock + g * output_size + 0x1])
+            g += N
+        end
+
+        sdata[ithread + 0x1] = acc
+        @synchronize()
+
+        @inline reduce_group!(@context, op, sdata, N, ithread)
+
+        if ithread == 0x0
+            dst[iblock + 0x1] = op(init, sdata[0x1])
+        end
     end
 end
diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl
index 230ded1b..ba29351e 100644
--- a/src/reduce/reduce.jl
+++ b/src/reduce/reduce.jl
@@ -1,5 +1,52 @@
 # Backend implementations
 include("utilities.jl")
+
+const MapReduceSource = Union{AbstractArray, Base.Broadcast.Broadcasted}
+
+function _mapreduce_backend(src::AbstractArray)
+    return _mapreduce_get_backend(src)
+end
+
+function _mapreduce_backend(src::Base.Broadcast.Broadcasted)
+    backend = _mapreduce_backend_from_args(src.args)
+    return isnothing(backend) ? CPU_BACKEND : backend
+end
+
+function _mapreduce_get_backend(src::AbstractArray)
+    try
+        return get_backend(src)
+    catch err
+        err isa ArgumentError || rethrow()
+        return CPU_BACKEND
+    end
+end
+
+_mapreduce_backend_from_arg(src::AbstractArray) = _mapreduce_get_backend(src)
+_mapreduce_backend_from_arg(src::Base.Broadcast.Broadcasted) = _mapreduce_backend(src)
+_mapreduce_backend_from_arg(_) = nothing
+
+function _mapreduce_backend_from_args(args::Tuple)
+    backend = nothing
+    for arg in args
+        arg_backend = _mapreduce_backend_from_arg(arg)
+        isnothing(arg_backend) && continue
+        if isnothing(backend)
+            backend = arg_backend
+        else
+            @argcheck arg_backend == backend
+        end
+    end
+    return backend
+end
+
+function _mapreduce_check_map_axes(src::AbstractArray, srcs::AbstractArray...)
+    src_axes = axes(src)
+    for other in srcs
+        axes(other) == src_axes || throw(DimensionMismatch("all input arrays must have the same axes"))
+    end
+    return nothing
+end
+
 include("mapreduce_1d_cpu.jl")
 include("mapreduce_1d_gpu.jl")
 include("mapreduce_nd.jl")
@@ -9,8 +56,8 @@ include("mapreduce_nd.jl")
     reduce(
         op, src::AbstractArray, backend::Backend=get_backend(src);
         init,
-        neutral=neutral_element(op, eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        neutral=neutral_element(op, typeof(init)),
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
@@ -22,16 +69,17 @@ include("mapreduce_nd.jl")
         switch_below::Int=0,
     )
 
-Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing`, reduce
-`src` to a scalar. If `dims` is an integer, reduce `src` along that dimension. The `init` value is
-used as the initial value for the reduction; `neutral` is the neutral element for the operator `op`.
+Reduce `src` along dimensions `dims` using the binary operator `op`. If `dims` is `nothing` or
+`:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of integers, reduce `src` along
+those dimension(s). The `init` value is used as the initial value for the reduction; `neutral` is
+the neutral element for the operator `op`.
 
 The returned type is the same as `init` - to control output precision, specify `init` explicitly.
 
 ## CPU settings
 Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
-arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
-cases are scaling linearly with the number of threads.
+arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
+`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
 
 Note that multithreading reductions only improves performance for cases with more compute-heavy
 operations, which hide the memory latency and thread launch overhead - that includes:
@@ -41,14 +89,15 @@ operations, which hide the memory latency and thread launch overhead - that incl
 For non-memory-bound operations, reductions scale almost linearly with the number of threads.
 
 ## GPU settings
-The `block_size` parameter controls the number of threads per block.
+The `block_size` parameter controls the number of threads per block and must be a power of two.
 
 The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
-(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
-required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
-array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
-except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
-zero, check against `Base.reduce` for CPU arrays for exact behavior.
+(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
+block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
+used as the destination array, and thus must have the exact dimensions required - i.e. same
+dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some
+corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact
+behavior.
 
 The `switch_below` parameter controls the threshold below which the reduction is performed on the
 CPU and is only used for 1D reductions (i.e. `dims=nothing`).
@@ -74,7 +123,7 @@ mcolsum = AK.reduce(+, m; init=zero(eltype(m)), dims=2)
 ```
 """
 function reduce(
-    op, src::AbstractArray, backend::Backend=get_backend(src);
+    op, src::AbstractArray, backend::Backend=_mapreduce_backend(src);
     init,
     kwargs...
 )
@@ -92,8 +141,8 @@ end
     mapreduce(
         f, op, src::AbstractArray, backend::Backend=get_backend(src);
         init,
-        neutral=neutral_element(op, eltype(src)),
-        dims::Union{Nothing, Int}=nothing,
+        neutral=neutral_element(op, typeof(init)),
+        dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon}=nothing,
 
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
@@ -105,29 +154,40 @@ end
         switch_below::Int=0,
     )
 
+    mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray...; init, kwargs...)
+    mapreduce(f, op, A::AbstractArray, B::AbstractArray, As::AbstractArray..., backend::Backend; init, kwargs...)
+
 Reduce `src` along dimensions `dims` using the binary operator `op` after applying `f` elementwise.
-If `dims` is `nothing`, reduce `src` to a scalar. If `dims` is an integer, reduce `src` along that
-dimension. The `init` value is used as the initial value for the reduction (i.e. after mapping).
+If `dims` is `nothing` or `:`, reduce `src` to a scalar. If `dims` is an integer or a tuple of
+integers, reduce `src` along those dimension(s). The `init` value is used as the initial value for
+the reduction (i.e. after mapping).
 
-The `neutral` value is the neutral element (zero) for the operator `op`, which is needed for an
-efficient GPU implementation that also allows a nonzero `init`.
+The `neutral` value is the neutral element for the operator `op`, which is needed for an efficient
+GPU implementation that also allows a nonzero `init`.
 
 The returned type is the same as `init` - to control output precision, specify `init` explicitly.
 
+Multiple input arrays are supported with the same axes. This follows `Base.mapreduce(f, op, A, B,
+...)` semantics: `f` is mapped across corresponding elements of the inputs and the mapped values
+are reduced without materializing the intermediate array. Mismatched axes throw
+`DimensionMismatch`; singleton-expanding broadcast semantics are reserved for internal
+`Broadcasted` sources used by array backends.
+
 ## CPU settings
 Use at most `max_tasks` threads with at least `min_elems` elements per task. For N-dimensional
-arrays (`dims::Int`) multithreading currently only becomes faster for `max_tasks >= 4`; all other
-cases are scaling linearly with the number of threads.
+arrays (`dims` is an integer or tuple) multithreading currently only becomes faster for
+`max_tasks >= 4`; all other cases are scaling linearly with the number of threads.
 
 ## GPU settings
-The `block_size` parameter controls the number of threads per block.
+The `block_size` parameter controls the number of threads per block and must be a power of two.
 
 The `temp` parameter can be used to pass a pre-allocated temporary array. For reduction to a scalar
-(`dims=nothing`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 * block_size)` is
-required. For reduction along a dimension (`dims` is an integer), `temp` is used as the destination
-array, and thus must have the exact dimensions required - i.e. same dimensionwise sizes as `src`,
-except for the reduced dimension which becomes 1; there are some corner cases when one dimension is
-zero, check against `Base.reduce` for CPU arrays for exact behavior.
+(`dims=nothing` or `dims=:`), `length(temp) >= 2 * (length(src) + 2 * block_size - 1) ÷ (2 *
+block_size)` is required. For reduction along dimensions (`dims` is an integer or tuple), `temp` is
+used as the destination array, and thus must have the exact dimensions required - i.e. same
+dimensionwise sizes as `src`, except for the reduced dimension(s) which become 1; there are some
+corner cases when one dimension is zero, check against `Base.reduce` for CPU arrays for exact
+behavior.
 
 The `switch_below` parameter controls the threshold below which the reduction is performed on the
 CPU and is only used for 1D reductions (i.e. `dims=nothing`).
@@ -152,9 +212,19 @@ m = MtlArray(rand(Int32(1):Int32(100), 10, 100_000))
 mrowsumsq = AK.mapreduce(f, +, m; init=zero(eltype(m)), dims=1)
 mcolsumsq = AK.mapreduce(f, +, m; init=zero(eltype(m)), dims=2)
 ```
+
+Computing a two-input dimensional reduction:
+```julia
+rows = AK.mapreduce((x, y) -> x * y, +, a, b; init=0f0, dims=1)
+```
+
+An explicit backend may be passed after all input arrays:
+```julia
+rows = AK.mapreduce((x, y) -> x * y, +, a, b, backend; init=0f0, dims=1)
+```
 """
 function mapreduce(
-    f, op, src::AbstractArray, backend::Backend=get_backend(src);
+    f, op, src::MapReduceSource, backend::Backend=_mapreduce_backend(src);
     init,
     kwargs...
 )
@@ -165,12 +235,53 @@ function mapreduce(
     )
 end
 
+function mapreduce(
+    f, op, src::AbstractArray, src2::AbstractArray, srcs::AbstractArray...;
+    init,
+    kwargs...
+)
+    return _mapreduce_multi(
+        f, op, nothing, src, src2, srcs...;
+        init,
+        kwargs...
+    )
+end
+
+function mapreduce(
+    f, op, src::AbstractArray, src2::AbstractArray, arg, args...;
+    init,
+    kwargs...
+)
+    backend = isempty(args) ? arg : args[end]
+    backend isa Backend || throw(MethodError(mapreduce, (f, op, src, src2, arg, args...)))
+    srcs = isempty(args) ? () : (arg, args[1:end - 1]...)
+    return _mapreduce_multi(
+        f, op, backend, src, src2, srcs...;
+        init,
+        kwargs...
+    )
+end
+
+function _mapreduce_multi(
+    f, op, backend::Union{Nothing, Backend}, src::AbstractArray, srcs::AbstractArray...;
+    init,
+    kwargs...
+)
+    _mapreduce_check_map_axes(src, srcs...)
+    bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(f, src, srcs...))
+    return mapreduce(
+        identity, op, bc, isnothing(backend) ? _mapreduce_backend(bc) : backend;
+        init,
+        kwargs...
+    )
+end
+
 
 function _mapreduce_impl(
-    f, op, src::AbstractArray, backend::Backend;
+    f, op, src::MapReduceSource, backend::Backend;
     init,
-    neutral=neutral_element(op, eltype(src)),
-    dims::Union{Nothing, Int}=nothing,
+    neutral=neutral_element(op, typeof(init)),
+    dims::Union{Nothing, Int, Tuple{Vararg{Int}}, Colon} = nothing,
 
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
@@ -182,7 +293,14 @@ function _mapreduce_impl(
     temp::Union{Nothing, AbstractArray}=nothing,
     switch_below::Int=0,
 )
-    if isnothing(dims)
+
+    # scalar *linear* indexing into a multidimensional Broadcasted object is
+    # only available on Julia 1.12; on earlier version, materialize it first.
+    if VERSION < v"1.12-" && src isa Base.Broadcast.Broadcasted
+        src = Base.Broadcast.materialize(src)
+    end
+
+    if isnothing(dims) || dims isa Colon
         if use_gpu_algorithm(backend, prefer_threads)
             mapreduce_1d_gpu(
                 f, op, src, backend;
diff --git a/test/generic/reduce.jl b/test/generic/reduce.jl
index 9fe8b5c2..3f0587ea 100644
--- a/test/generic/reduce.jl
+++ b/test/generic/reduce.jl
@@ -120,8 +120,20 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         @test s == reduce(+, vh)
     end
 
+    # Base-compatible alias: dims=: reduces all dimensions to a scalar.
+    vh_colon = rand(Int32(1):Int32(10), 3, 4, 5)
+    @test AK.reduce(+, array_from_host(vh_colon); prefer_threads, init=Int32(0), dims=:) ==
+        reduce(+, vh_colon; init=Int32(0), dims=:)
+
+    vh_one = Int32[7]
+    @test AK.reduce(+, array_from_host(vh_one); prefer_threads, init=Int32(10)) ==
+        reduce(+, vh_one; init=Int32(10))
+
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10)); init=10, bad=:kwarg)
+    if !IS_CPU_BACKEND
+        @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 256)); prefer_threads, init=Int32(0), block_size=192)
+    end
 
     # Testing different settings
     AK.reduce(
@@ -222,8 +234,67 @@ end
         end
     end
 
+    # Duplicate dims match Base semantics and are reduced once.
+    vh_dup = rand(Int32(1):Int32(10), 3, 4, 5)
+    @test Array(AK.reduce(+, array_from_host(vh_dup); prefer_threads, init=Int32(0), dims=(2,2))) ==
+        sum(vh_dup; init=Int32(0), dims=(2,2))
+
+    # min/max with dims: tests correct neutral element in partial reduction
+    for dims in 1:3
+        n1 = rand(1:50); n2 = rand(1:50); n3 = rand(1:50)
+        vh = rand(Int32(1):Int32(100), n1, n2, n3)
+        v = array_from_host(vh)
+        @test Array(AK.reduce(min, v; prefer_threads, init=typemax(Int32), neutral=typemax(Int32), dims)) == minimum(vh; dims)
+        @test Array(AK.reduce(max, v; prefer_threads, init=typemin(Int32), neutral=typemin(Int32), dims)) == maximum(vh; dims)
+    end
+
+    # Tuple dims support. Order and duplicates match Base semantics.
+    for dims in [(1,2), (1,3), (2,3), (1,2,3), (2,1), (3,1), (2,1,2)]
+        for n1 in [1, 5, 10], n2 in [1, 5, 10], n3 in [1, 5, 10]
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.reduce(+, v; prefer_threads, init=Int32(0), dims)
+            sh = Array(s)
+            @test sh == sum(vh; dims)
+        end
+    end
+
+    # Tiled strided GPU path: contiguous kept dimensions, one strided reduce
+    # dimension, and dst_size == reduce_size. The 3D case also exercises a
+    # partial output tile.
+    for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3))
+        vh = rand(Int32(1):Int32(3), shape...)
+        v = array_from_host(vh)
+        @test Array(AK.reduce(+, v; prefer_threads, init=Int32(0), dims)) ==
+            sum(vh; init=Int32(0), dims)
+    end
+
+    if IS_CPU_BACKEND
+        # The CPU fallback should not require strided storage.
+        vh = reshape(1:12, 1, 3, 4)
+        @test Array(AK.reduce(+, vh, BACKEND; prefer_threads, init=0, dims=(1,2))) ==
+            sum(vh; init=0, dims=(1,2))
+    else
+        # Strided GPU sources (views, adjoints, permuted dims) take the stride-based
+        # fast path over their dense parent buffer; the offset view exercises a nonzero
+        # base offset. Broadcasted/lazy sources still take the generic fallback.
+        vh = reshape(Int32(1):Int32(40), 5, 8)
+        v = array_from_host(vh)
+        @test Array(AK.reduce(+, @view(v[:, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) ==
+            Base.reduce(+, @view(vh[:, 1:2:end]); init=Int32(0), dims=2)
+        @test Array(AK.reduce(+, @view(v[2:end, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) ==
+            Base.reduce(+, @view(vh[2:end, 1:2:end]); init=Int32(0), dims=2)
+        @test Array(AK.reduce(+, v'; prefer_threads, init=Int32(0), dims=1)) ==
+            Base.reduce(+, vh'; init=Int32(0), dims=1)
+        @test Array(AK.reduce(+, PermutedDimsArray(v, (2, 1)); prefer_threads, init=Int32(0), dims=1)) ==
+            Base.reduce(+, PermutedDimsArray(vh, (2, 1)); init=Int32(0), dims=1)
+    end
+
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); prefer_threads, init=10, bad=:kwarg)
+    if !IS_CPU_BACKEND
+        @test_throws ArgumentError AK.reduce(+, array_from_host(rand(Int32, 16, 16)); prefer_threads, init=Int32(0), dims=1, block_size=192)
+    end
 
     # Testing different settings
     AK.reduce(
@@ -342,6 +413,76 @@ end
         @test s == mapreduce(abs, +, vh)
     end
 
+    # Base-compatible alias: dims=: reduces all dimensions to a scalar.
+    vh_colon = rand(Int32(-10):Int32(10), 3, 4, 5)
+    @test AK.mapreduce(abs, +, array_from_host(vh_colon); prefer_threads, init=Int32(0), dims=:) ==
+        mapreduce(abs, +, vh_colon; init=Int32(0), dims=:)
+
+    vh_one = Int32[-7]
+    @test AK.mapreduce(abs, +, array_from_host(vh_one); prefer_threads, init=Int32(10)) ==
+        mapreduce(abs, +, vh_one; init=Int32(10))
+
+    vh_typechange = rand(Int32(-10):Int32(10), 4, 5)
+    f_typechange = x -> Float32(x) / 2
+    @test AK.mapreduce(f_typechange, +, array_from_host(vh_typechange); prefer_threads, init=0f0) ≈
+        mapreduce(f_typechange, +, vh_typechange; init=0f0)
+    @test Array(AK.mapreduce(f_typechange, +, array_from_host(vh_typechange); prefer_threads, init=0f0, dims=2)) ≈
+        mapreduce(f_typechange, +, vh_typechange; init=0f0, dims=2)
+    f_min_typechange = x -> Float32(10_000_000_000 + x)
+    f_max_typechange = x -> Float32(-10_000_000_000 + x)
+    @test AK.mapreduce(f_min_typechange, min, array_from_host(vh_typechange); prefer_threads, init=Inf32) ≈
+        mapreduce(f_min_typechange, min, vh_typechange; init=Inf32)
+    @test AK.mapreduce(f_max_typechange, max, array_from_host(vh_typechange); prefer_threads, init=-Inf32) ≈
+        mapreduce(f_max_typechange, max, vh_typechange; init=-Inf32)
+
+    # Multi-input mapreduce lowers through a broadcasted source.
+    vh_a = rand(Int32(-10):Int32(10), 4, 5, 6)
+    vh_b = rand(Int32(-10):Int32(10), 4, 5, 6)
+    vh_c = rand(Int32(-10):Int32(10), 4, 5, 6)
+    v_a = array_from_host(vh_a)
+    v_b = array_from_host(vh_b)
+    v_c = array_from_host(vh_c)
+    @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0)) ==
+        mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0))
+    @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b, BACKEND; prefer_threads, init=Int32(0)) ==
+        mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0))
+    @test AK.mapreduce((x, y, z) -> x + y * z, +, v_a, v_b, v_c, BACKEND; prefer_threads, init=Int32(0)) ==
+        mapreduce((x, y, z) -> x + y * z, +, vh_a, vh_b, vh_c; init=Int32(0))
+    @test AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0), dims=:) ==
+        mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0), dims=:)
+    @test Array(AK.mapreduce((x, y) -> x * y, +, v_a, v_b; prefer_threads, init=Int32(0), dims=())) ==
+        mapreduce((x, y) -> x * y, +, vh_a, vh_b; init=Int32(0), dims=())
+    @test AK.mapreduce((x, y) -> Float32(x - y) / 3, +, v_a, v_b; prefer_threads, init=0f0) ≈
+        mapreduce((x, y) -> Float32(x - y) / 3, +, vh_a, vh_b; init=0f0)
+
+    for (shape, dims) in (((0, 3), 1), ((2, 0), 2), ((0, 0), (1, 2)), ((0, 3), ()))
+        h_empty1 = reshape(Int32[], shape...)
+        h_empty2 = fill(Int32(2), shape...)
+        @test Array(AK.mapreduce((x, y) -> x + y, +,
+                                  array_from_host(h_empty1),
+                                  array_from_host(h_empty2);
+                                  prefer_threads, init=Int32(10), dims)) ==
+            mapreduce((x, y) -> x + y, +, h_empty1, h_empty2; init=Int32(10), dims)
+    end
+
+    @test_throws DimensionMismatch AK.mapreduce(
+        (x, y) -> x + y, +,
+        array_from_host(rand(Int32, 2, 3)),
+        array_from_host(rand(Int32, 1, 3));
+        prefer_threads,
+        init=Int32(0),
+    )
+
+    if IS_CPU_BACKEND
+        bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(+, reshape(1:6, 2, 3), reshape(10:15, 2, 3)))
+        @test AK.mapreduce(identity, +, bc; prefer_threads, init=0) ==
+            mapreduce(identity, +, bc; init=0)
+        @test Array(AK.mapreduce(identity, +, bc; prefer_threads, init=0, dims=2)) ==
+            mapreduce(identity, +, bc; init=0, dims=2)
+        @test Array(AK.mapreduce(identity, +, bc; prefer_threads, init=0, dims=())) ==
+            mapreduce(identity, +, bc; init=0, dims=())
+    end
+
     # Testing different settings, enforcing change of type between f and op
     f(s, temp) = AK.mapreduce(
         p -> (p.x, p.y),
@@ -362,6 +503,9 @@ end
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg)
+    if !IS_CPU_BACKEND
+        @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 256)); prefer_threads, init=Int32(0), block_size=192)
+    end
 end
 
 
@@ -458,8 +602,87 @@ end
         end
     end
 
+    # Duplicate dims match Base semantics and are reduced once.
+    vh_dup = rand(Int32(1):Int32(10), 3, 4, 5)
+    @test Array(AK.mapreduce(-, +, array_from_host(vh_dup); prefer_threads, init=Int32(0), dims=(2,2))) ==
+        mapreduce(-, +, vh_dup; init=Int32(0), dims=(2,2))
+
+    # Multi-input mapreduce with dimensional reductions.
+    vh_ma = rand(Int32(-10):Int32(10), 4, 5, 6)
+    vh_mb = rand(Int32(-10):Int32(10), 4, 5, 6)
+    v_ma = array_from_host(vh_ma)
+    v_mb = array_from_host(vh_mb)
+    for dims in (1, 2, (1, 2), (1, 3), (1, 2, 3), (2, 2))
+        @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb; prefer_threads, init=Int32(0), dims)) ==
+            mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims)
+    end
+    @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb, BACKEND; prefer_threads, init=Int32(0), dims=(1, 2))) ==
+        mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims=(1, 2))
+    @test Array(AK.mapreduce((x, y) -> x * y, +, v_ma, v_mb; prefer_threads, init=Int32(0), dims=())) ==
+        mapreduce((x, y) -> x * y, +, vh_ma, vh_mb; init=Int32(0), dims=())
+    @test Array(AK.mapreduce((x, y) -> Float32(x - y) / 3, +, v_ma, v_mb; prefer_threads, init=0f0, dims=(1, 2))) ≈
+        mapreduce((x, y) -> Float32(x - y) / 3, +, vh_ma, vh_mb; init=0f0, dims=(1, 2))
+    vh_typechange_nd = rand(Int32(-10):Int32(10), 4, 5)
+    f_min_typechange_nd = x -> Float32(10_000_000_000 + x)
+    f_max_typechange_nd = x -> Float32(-10_000_000_000 + x)
+    @test Array(AK.mapreduce(f_min_typechange_nd, min, array_from_host(vh_typechange_nd); prefer_threads, init=Inf32, dims=2)) ≈
+        mapreduce(f_min_typechange_nd, min, vh_typechange_nd; init=Inf32, dims=2)
+    @test Array(AK.mapreduce(f_max_typechange_nd, max, array_from_host(vh_typechange_nd); prefer_threads, init=-Inf32, dims=2)) ≈
+        mapreduce(f_max_typechange_nd, max, vh_typechange_nd; init=-Inf32, dims=2)
+
+    # min/max with dims: tests correct neutral element in partial reduction
+    for dims in 1:3
+        n1 = rand(1:50); n2 = rand(1:50); n3 = rand(1:50)
+        vh = rand(Int32(1):Int32(100), n1, n2, n3)
+        v = array_from_host(vh)
+        @test Array(AK.reduce(min, v; prefer_threads, init=typemax(Int32), neutral=typemax(Int32), dims)) == minimum(vh; dims)
+        @test Array(AK.reduce(max, v; prefer_threads, init=typemin(Int32), neutral=typemin(Int32), dims)) == maximum(vh; dims)
+    end
+
+    # Tuple dims support. Order and duplicates match Base semantics.
+    for dims in [(1,2), (1,3), (2,3), (1,2,3), (2,1), (3,1), (2,1,2)]
+        for n1 in [1, 5, 10], n2 in [1, 5, 10], n3 in [1, 5, 10]
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims)
+            sh = Array(s)
+            @test sh == mapreduce(-, +, vh; init=Int32(0), dims)
+        end
+    end
+
+    # Tiled strided GPU path coverage for mapreduce, including a 3D case with
+    # a partial output tile.
+    for (shape, dims) in (((512, 512), 2), ((20, 13, 260), 3))
+        vh = rand(Int32(1):Int32(3), shape...)
+        v = array_from_host(vh)
+        @test Array(AK.mapreduce(x -> x - Int32(1), +, v; prefer_threads, init=Int32(0), dims)) ==
+            mapreduce(x -> x - Int32(1), +, vh; init=Int32(0), dims)
+    end
+
+    if IS_CPU_BACKEND
+        # The CPU fallback should not require strided storage.
+        vh = reshape(1:12, 1, 3, 4)
+        @test Array(AK.mapreduce(x -> 2x, +, vh, BACKEND; prefer_threads, init=0, dims=(1,2))) ==
+            mapreduce(x -> 2x, +, vh; init=0, dims=(1,2))
+    else
+        # Strided GPU sources (views, adjoints, permuted dims) take the stride-based
+        # fast path over their dense parent buffer; the offset view exercises a nonzero
+        # base offset. Broadcasted/lazy sources still take the generic fallback.
+        vh = reshape(Int32(1):Int32(40), 5, 8)
+        v = array_from_host(vh)
+        @test Array(AK.mapreduce(x -> x - Int32(1), +, @view(v[:, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) ==
+            mapreduce(x -> x - Int32(1), +, @view(vh[:, 1:2:end]); init=Int32(0), dims=2)
+        @test Array(AK.mapreduce(x -> x - Int32(1), +, @view(v[2:end, 1:2:end]); prefer_threads, init=Int32(0), dims=2)) ==
+            mapreduce(x -> x - Int32(1), +, @view(vh[2:end, 1:2:end]); init=Int32(0), dims=2)
+        @test Array(AK.mapreduce(x -> x - Int32(1), +, PermutedDimsArray(v, (2, 1)); prefer_threads, init=Int32(0), dims=1)) ==
+            mapreduce(x -> x - Int32(1), +, PermutedDimsArray(vh, (2, 1)); init=Int32(0), dims=1)
+    end
+
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, init=10, bad=:kwarg)
+    if !IS_CPU_BACKEND
+        @test_throws ArgumentError AK.mapreduce(-, +, array_from_host(rand(Int32, 16, 16)); prefer_threads, init=Int32(0), dims=1, block_size=192)
+    end
 
     # Testing different settings
     AK.mapreduce(
@@ -528,6 +751,7 @@ end
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
     AK.sum(v; prefer_threads, block_size=64)
+    @test AK.sum(v; prefer_threads, dims=:) == sum(Array(v); dims=:)
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.sum(v; prefer_threads, bad=:kwarg)
@@ -573,6 +797,7 @@ end
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
     AK.prod(v; prefer_threads, block_size=64)
+    @test AK.prod(v; prefer_threads, dims=:) == prod(Array(v); dims=:)
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.prod(v; prefer_threads, bad=:kwarg)
@@ -618,6 +843,7 @@ end
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
     AK.minimum(v; prefer_threads, block_size=64)
+    @test AK.minimum(v; prefer_threads, dims=:) == minimum(Array(v); dims=:)
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.minimum(v; prefer_threads, bad=:kwarg)
@@ -663,6 +889,7 @@ end
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
     AK.maximum(v; prefer_threads, block_size=64)
+    @test AK.maximum(v; prefer_threads, dims=:) == maximum(Array(v); dims=:)
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.maximum(v; prefer_threads, bad=:kwarg)
@@ -715,6 +942,7 @@ end
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
     AK.count(x->x>0, v; prefer_threads, block_size=64)
+    @test AK.count(x->x>0, v; prefer_threads, dims=:) == count(x->x>0, Array(v); dims=:)
 
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.count(v; prefer_threads, bad=:kwarg)