Skip to content

Commit 0cd4f36

Browse files
fix: avoid rebuild in ListView take_reduce for dense selections
When the selection density is above the threshold, skip take_reduce so we don't rebuild the ListView for dense takes. Signed-off-by: Dimitar Dimitrov <mitko@spiraldb.com> Signed-off-by: Dimitar Dimitrov <dimitar@spiraldb.com>
1 parent 95befa7 commit 0cd4f36

File tree

4 files changed

+109
-70
lines changed

4 files changed

+109
-70
lines changed

vortex-array/public-api.lock

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2200,6 +2200,10 @@ impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::List
22002200

22012201
pub fn vortex_array::arrays::List::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::List>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
22022202

2203+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
2204+
2205+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
2206+
22032207
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::Primitive
22042208

22052209
pub fn vortex_array::arrays::Primitive::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::Primitive>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
@@ -2960,6 +2964,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr
29602964

29612965
pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>
29622966

2967+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
2968+
2969+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
2970+
29632971
impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView
29642972

29652973
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
@@ -5976,6 +5984,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr
59765984

59775985
pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>
59785986

5987+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
5988+
5989+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
5990+
59795991
impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView
59805992

59815993
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

vortex-array/src/arrays/dict/execute.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ pub fn take_canonical(
4646
Canonical::Primitive(a) => Canonical::Primitive(take_primitive(&a, codes, ctx)),
4747
Canonical::Decimal(a) => Canonical::Decimal(take_decimal(&a, codes, ctx)),
4848
Canonical::VarBinView(a) => Canonical::VarBinView(take_varbinview(&a, codes, ctx)),
49-
Canonical::List(a) => Canonical::List(take_listview(&a, codes)),
49+
Canonical::List(a) => Canonical::List(take_listview(&a, codes, ctx)),
5050
Canonical::FixedSizeList(a) => {
5151
Canonical::FixedSizeList(take_fixed_size_list(&a, codes, ctx))
5252
}
@@ -123,14 +123,24 @@ fn take_varbinview(
123123
.into_owned()
124124
}
125125

126-
fn take_listview(array: &ListViewArray, codes: &PrimitiveArray) -> ListViewArray {
126+
fn take_listview(
127+
array: &ListViewArray,
128+
codes: &PrimitiveArray,
129+
ctx: &mut ExecutionCtx,
130+
) -> ListViewArray {
127131
let codes_ref = codes.clone().into_array();
128132
let array = array.as_view();
129-
<ListView as TakeReduce>::take(array, &codes_ref)
130-
.vortex_expect("take listview array")
131-
.vortex_expect("take listview should not return None")
132-
.as_::<ListView>()
133-
.into_owned()
133+
// Try the cheap metadata-only path first; fall back to the rebuilding execute path when the
134+
// reduce impl declines (see `ListView::TakeReduce` for the density heuristic).
135+
let taken = match <ListView as TakeReduce>::take(array, &codes_ref)
136+
.vortex_expect("take listview reduce")
137+
{
138+
Some(taken) => taken,
139+
None => <ListView as TakeExecute>::take(array, &codes_ref, ctx)
140+
.vortex_expect("take listview execute")
141+
.vortex_expect("ListView TakeExecute should not return None"),
142+
};
143+
taken.as_::<ListView>().into_owned()
134144
}
135145

136146
fn take_fixed_size_list(

vortex-array/src/arrays/filter/execute/listview.rs

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,6 @@ use crate::arrays::filter::execute::values_to_mask;
1212
use crate::arrays::listview::ListViewArrayExt;
1313
use crate::arrays::listview::ListViewRebuildMode;
1414

15-
// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
16-
/// The threshold for triggering a rebuild of the [`ListViewArray`].
17-
///
18-
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
19-
/// can be potentially expensive to reorganize the array based on what views we have into it.
20-
///
21-
/// However, we also do not want to carry around a large amount of garbage data. Below this
22-
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
23-
/// any garbage data.
24-
#[allow(unused)]
25-
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;
26-
2715
/// [`ListViewArray`] filter implementation.
2816
///
2917
/// This implementation is deliberately simple and read-optimized. We just filter the `offsets` and

vortex-array/src/arrays/listview/compute/take.rs

Lines changed: 80 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@ use num_traits::Zero;
55
use vortex_error::VortexResult;
66

77
use crate::ArrayRef;
8+
use crate::ExecutionCtx;
89
use crate::IntoArray;
910
use crate::array::ArrayView;
1011
use crate::arrays::ListView;
1112
use crate::arrays::ListViewArray;
13+
use crate::arrays::dict::TakeExecute;
1214
use crate::arrays::dict::TakeReduce;
1315
use crate::arrays::listview::ListViewArrayExt;
1416
use crate::arrays::listview::ListViewRebuildMode;
@@ -17,72 +19,99 @@ use crate::dtype::Nullability;
1719
use crate::match_each_integer_ptype;
1820
use crate::scalar::Scalar;
1921

20-
// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
21-
/// The threshold for triggering a rebuild of the [`ListViewArray`].
22+
/// The threshold below which we return `None` from [`TakeReduce`] so callers fall back to
23+
/// [`TakeExecute`] and rebuild the underlying `elements` buffer.
2224
///
23-
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
24-
/// can be potentially expensive to reorganize the array based on what views we have into it.
25-
///
26-
/// However, we also do not want to carry around a large amount of garbage data. Below this
27-
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
28-
/// any garbage data.
29-
#[allow(unused)]
30-
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;
25+
/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive.
26+
/// However, we also don't want to drag around a large amount of garbage data when the selection
27+
/// is sparse. Below this fraction of list rows retained, the rebuild is worth it.
28+
const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
3129

32-
/// [`ListViewArray`] take implementation.
30+
/// Metadata-only take for [`ListViewArray`].
3331
///
3432
/// This implementation is deliberately simple and read-optimized. We just take the `offsets` and
35-
/// `sizes` at the requested indices and reuse the original `elements` array. This works because
36-
/// `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
33+
/// `sizes` at the requested indices and reuse the original `elements` buffer as-is. This works
34+
/// because `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
3735
///
3836
/// We don't slice the `elements` array because it would require computing min/max offsets and
39-
/// adjusting all offsets accordingly, which is not really worth the small potential memory we would
40-
/// be able to get back.
37+
/// adjusting all offsets accordingly, which is not really worth the small potential memory we
38+
/// would be able to get back.
39+
///
40+
/// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable
41+
/// since we're optimizing for read performance and the data isn't being copied.
4142
///
42-
/// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable since
43-
/// we're optimizing for read performance and the data isn't being copied.
43+
/// When the selection density drops below `REBUILD_DENSITY_THRESHOLD`, we return `None` so
44+
/// callers can fall back to [`TakeExecute`], which compacts `elements` via a rebuild. Dense
45+
/// selections keep the cheap metadata-only path.
4446
impl TakeReduce for ListView {
4547
fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
46-
let elements = array.elements();
47-
let offsets = array.offsets();
48-
let sizes = array.sizes();
49-
50-
// Compute the new validity by combining the array's validity with the indices' validity.
51-
let new_validity = array.validity()?.take(indices)?;
52-
53-
// Take the offsets and sizes arrays at the requested indices.
54-
// Take can reorder offsets, create gaps, and may introduce overlaps if the `indices`
55-
// contain duplicates.
56-
let nullable_new_offsets = offsets.take(indices.clone())?;
57-
let nullable_new_sizes = sizes.take(indices.clone())?;
48+
// Approximate element density by the fraction of list rows retained. Assumes roughly
49+
// uniform list sizes; good enough to decide whether dragging along the full `elements`
50+
// buffer is worth avoiding a rebuild.
51+
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
52+
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
53+
return Ok(None);
54+
}
5855

59-
// Since `take` returns nullable arrays, we simply cast it back to non-nullable (filled with
60-
// zeros to represent null lists).
61-
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
62-
nullable_new_offsets
63-
.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
64-
});
65-
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
66-
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
67-
});
68-
// SAFETY: Take operation maintains all `ListViewArray` invariants:
69-
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
70-
// - `new_offsets` and `new_sizes` are non-nullable.
71-
// - `new_offsets` and `new_sizes` have the same length (both taken with the same
72-
// `indices`).
73-
// - Validity correctly reflects the combination of array and indices validity.
74-
let new_array = unsafe {
75-
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
76-
};
56+
Ok(Some(apply_take(array, indices)?.into_array()))
57+
}
58+
}
7759

60+
/// Execution-path take for [`ListViewArray`].
61+
///
62+
/// This does the same metadata-only take as [`TakeReduce`], then unconditionally rebuilds the
63+
/// result via [`ListViewRebuildMode::MakeZeroCopyToList`] so the output does not carry
64+
/// unreferenced elements from the source. Callers reach this path when [`TakeReduce`] returns
65+
/// `None` (sparse selections) or during `Dict` canonicalization, where we want to materialize a
66+
/// compacted result.
67+
impl TakeExecute for ListView {
68+
fn take(
69+
array: ArrayView<'_, ListView>,
70+
indices: &ArrayRef,
71+
_ctx: &mut ExecutionCtx,
72+
) -> VortexResult<Option<ArrayRef>> {
7873
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
79-
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
80-
// right now, so we will just rebuild every time (similar to `ListArray`).
81-
74+
// compute functions have run, at the "top" of the operator tree. However, we cannot do
75+
// this right now, so we will just rebuild every time (similar to `ListArray`).
76+
let taken = apply_take(array, indices)?;
8277
Ok(Some(
83-
new_array
78+
taken
8479
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
8580
.into_array(),
8681
))
8782
}
8883
}
84+
85+
/// Shared metadata-only take: take `offsets`, `sizes` and `validity` at `indices` while reusing
86+
/// the original `elements` buffer as-is.
87+
fn apply_take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<ListViewArray> {
88+
let elements = array.elements();
89+
let offsets = array.offsets();
90+
let sizes = array.sizes();
91+
92+
// Combine the array's validity with the indices' validity.
93+
let new_validity = array.validity()?.take(indices)?;
94+
95+
// Take can reorder offsets, create gaps, and may introduce overlaps if `indices` contain
96+
// duplicates.
97+
let nullable_new_offsets = offsets.take(indices.clone())?;
98+
let nullable_new_sizes = sizes.take(indices.clone())?;
99+
100+
// `take` returns nullable arrays; cast back to non-nullable (filling with zeros to represent
101+
// the null lists — the validity mask tracks nullness separately).
102+
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
103+
nullable_new_offsets.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
104+
});
105+
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
106+
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
107+
});
108+
109+
// SAFETY: Take operation maintains all `ListViewArray` invariants:
110+
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
111+
// - `new_offsets` and `new_sizes` are non-nullable.
112+
// - `new_offsets` and `new_sizes` have the same length (both taken with the same `indices`).
113+
// - Validity correctly reflects the combination of array and indices validity.
114+
Ok(unsafe {
115+
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
116+
})
117+
}

0 commit comments

Comments
 (0)