@@ -5,10 +5,12 @@ use num_traits::Zero;
55use vortex_error:: VortexResult ;
66
77use crate :: ArrayRef ;
8+ use crate :: ExecutionCtx ;
89use crate :: IntoArray ;
910use crate :: array:: ArrayView ;
1011use crate :: arrays:: ListView ;
1112use crate :: arrays:: ListViewArray ;
13+ use crate :: arrays:: dict:: TakeExecute ;
1214use crate :: arrays:: dict:: TakeReduce ;
1315use crate :: arrays:: listview:: ListViewArrayExt ;
1416use crate :: arrays:: listview:: ListViewRebuildMode ;
@@ -17,72 +19,99 @@ use crate::dtype::Nullability;
1719use crate :: match_each_integer_ptype;
1820use crate :: scalar:: Scalar ;
1921
/// Fraction of list rows retained below which [`TakeReduce`] declines (returns `None`) so that
/// callers fall back to [`TakeExecute`] and rebuild the underlying `elements` buffer.
///
/// The metadata-only path never touches `elements`, because reorganizing it can be expensive.
/// For a sparse selection, however, that means dragging around a large amount of garbage data,
/// so below this density the rebuild is considered worth its cost.
const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
3129
32- /// [`ListViewArray`] take implementation .
30+ /// Metadata-only take for [`ListViewArray`].
3331///
3432/// This implementation is deliberately simple and read-optimized. We just take the `offsets` and
35- /// `sizes` at the requested indices and reuse the original `elements` array . This works because
36- /// `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
33+ /// `sizes` at the requested indices and reuse the original `elements` buffer as-is . This works
34+ /// because `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
3735///
3836/// We don't slice the `elements` array because it would require computing min/max offsets and
39- /// adjusting all offsets accordingly, which is not really worth the small potential memory we would
40- /// be able to get back.
37+ /// adjusting all offsets accordingly, which is not really worth the small potential memory we
38+ /// would be able to get back.
39+ ///
40+ /// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable
41+ /// since we're optimizing for read performance and the data isn't being copied.
4142///
42- /// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable since
43- /// we're optimizing for read performance and the data isn't being copied.
43+ /// When the selection density drops below `REBUILD_DENSITY_THRESHOLD`, we return `None` so
44+ /// callers can fall back to [`TakeExecute`], which compacts `elements` via a rebuild. Dense
45+ /// selections keep the cheap metadata-only path.
4446impl TakeReduce for ListView {
4547 fn take ( array : ArrayView < ' _ , ListView > , indices : & ArrayRef ) -> VortexResult < Option < ArrayRef > > {
46- let elements = array. elements ( ) ;
47- let offsets = array. offsets ( ) ;
48- let sizes = array. sizes ( ) ;
49-
50- // Compute the new validity by combining the array's validity with the indices' validity.
51- let new_validity = array. validity ( ) ?. take ( indices) ?;
52-
53- // Take the offsets and sizes arrays at the requested indices.
54- // Take can reorder offsets, create gaps, and may introduce overlaps if the `indices`
55- // contain duplicates.
56- let nullable_new_offsets = offsets. take ( indices. clone ( ) ) ?;
57- let nullable_new_sizes = sizes. take ( indices. clone ( ) ) ?;
48+ // Approximate element density by the fraction of list rows retained. Assumes roughly
49+ // uniform list sizes; good enough to decide whether dragging along the full `elements`
50+ // buffer is worth avoiding a rebuild.
51+ let kept_row_fraction = indices. len ( ) as f32 / array. sizes ( ) . len ( ) as f32 ;
52+ if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
53+ return Ok ( None ) ;
54+ }
5855
59- // Since `take` returns nullable arrays, we simply cast it back to non-nullable (filled with
60- // zeros to represent null lists).
61- let new_offsets = match_each_integer_ptype ! ( nullable_new_offsets. dtype( ) . as_ptype( ) , |O | {
62- nullable_new_offsets
63- . fill_null( Scalar :: primitive( O :: zero( ) , Nullability :: NonNullable ) ) ?
64- } ) ;
65- let new_sizes = match_each_integer_ptype ! ( nullable_new_sizes. dtype( ) . as_ptype( ) , |S | {
66- nullable_new_sizes. fill_null( Scalar :: primitive( S :: zero( ) , Nullability :: NonNullable ) ) ?
67- } ) ;
68- // SAFETY: Take operation maintains all `ListViewArray` invariants:
69- // - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
70- // - `new_offsets` and `new_sizes` are non-nullable.
71- // - `new_offsets` and `new_sizes` have the same length (both taken with the same
72- // `indices`).
73- // - Validity correctly reflects the combination of array and indices validity.
74- let new_array = unsafe {
75- ListViewArray :: new_unchecked ( elements. clone ( ) , new_offsets, new_sizes, new_validity)
76- } ;
56+ Ok ( Some ( apply_take ( array, indices) ?. into_array ( ) ) )
57+ }
58+ }
7759
60+ /// Execution-path take for [`ListViewArray`].
61+ ///
62+ /// This does the same metadata-only take as [`TakeReduce`], then unconditionally rebuilds the
63+ /// result via [`ListViewRebuildMode::MakeZeroCopyToList`] so the output does not carry
64+ /// unreferenced elements from the source. Callers reach this path when [`TakeReduce`] returns
65+ /// `None` (sparse selections) or during `Dict` canonicalization, where we want to materialize a
66+ /// compacted result.
67+ impl TakeExecute for ListView {
68+ fn take (
69+ array : ArrayView < ' _ , ListView > ,
70+ indices : & ArrayRef ,
71+ _ctx : & mut ExecutionCtx ,
72+ ) -> VortexResult < Option < ArrayRef > > {
7873 // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
79- // compute functions have run, at the "top" of the operator tree. However, we cannot do this
80- // right now, so we will just rebuild every time (similar to `ListArray`).
81-
74+ // compute functions have run, at the "top" of the operator tree. However, we cannot do
75+ // this right now, so we will just rebuild every time (similar to `ListArray`).
76+ let taken = apply_take ( array , indices ) ? ;
8277 Ok ( Some (
83- new_array
78+ taken
8479 . rebuild ( ListViewRebuildMode :: MakeZeroCopyToList ) ?
8580 . into_array ( ) ,
8681 ) )
8782 }
8883}
84+
85+ /// Shared metadata-only take: take `offsets`, `sizes` and `validity` at `indices` while reusing
86+ /// the original `elements` buffer as-is.
87+ fn apply_take ( array : ArrayView < ' _ , ListView > , indices : & ArrayRef ) -> VortexResult < ListViewArray > {
88+ let elements = array. elements ( ) ;
89+ let offsets = array. offsets ( ) ;
90+ let sizes = array. sizes ( ) ;
91+
92+ // Combine the array's validity with the indices' validity.
93+ let new_validity = array. validity ( ) ?. take ( indices) ?;
94+
95+ // Take can reorder offsets, create gaps, and may introduce overlaps if `indices` contain
96+ // duplicates.
97+ let nullable_new_offsets = offsets. take ( indices. clone ( ) ) ?;
98+ let nullable_new_sizes = sizes. take ( indices. clone ( ) ) ?;
99+
100+ // `take` returns nullable arrays; cast back to non-nullable (filling with zeros to represent
101+ // the null lists — the validity mask tracks nullness separately).
102+ let new_offsets = match_each_integer_ptype ! ( nullable_new_offsets. dtype( ) . as_ptype( ) , |O | {
103+ nullable_new_offsets. fill_null( Scalar :: primitive( O :: zero( ) , Nullability :: NonNullable ) ) ?
104+ } ) ;
105+ let new_sizes = match_each_integer_ptype ! ( nullable_new_sizes. dtype( ) . as_ptype( ) , |S | {
106+ nullable_new_sizes. fill_null( Scalar :: primitive( S :: zero( ) , Nullability :: NonNullable ) ) ?
107+ } ) ;
108+
109+ // SAFETY: Take operation maintains all `ListViewArray` invariants:
110+ // - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
111+ // - `new_offsets` and `new_sizes` are non-nullable.
112+ // - `new_offsets` and `new_sizes` have the same length (both taken with the same `indices`).
113+ // - Validity correctly reflects the combination of array and indices validity.
114+ Ok ( unsafe {
115+ ListViewArray :: new_unchecked ( elements. clone ( ) , new_offsets, new_sizes, new_validity)
116+ } )
117+ }
0 commit comments