From 7540e45481ca4630896af6f892b171fba8e866d1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 16 May 2026 00:00:18 -0700 Subject: [PATCH 1/6] feat: simplify get_field over inline struct constructors Add a logical simplifier rule that resolves field access against inline `named_struct(...)` and `struct(...)` constructors at plan time, so a struct that is built and immediately read back never needs to be materialized: * `get_field(named_struct('min', a, 'max', b), 'max')` => `b` * `get_field(struct(a, b), 'c1')` => `b` * nested constructors collapse all the way through The rewrite lives in `GetFieldFunc::simplify` (the same hook that already flattens nested `get_field` calls) and bails out whenever it cannot be proven safe. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 317 ++++++++++++++++-- datafusion/sqllogictest/test_files/order.slt | 14 +- datafusion/sqllogictest/test_files/struct.slt | 43 +++ 3 files changed, 345 insertions(+), 29 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index b7092afcee492..d00daed252755 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -37,6 +37,9 @@ use datafusion_expr::{ }; use datafusion_macros::user_doc; +use super::named_struct::NamedStructFunc; +use super::r#struct::StructFunc; + #[user_doc( doc_section(label = "Other Functions"), description = r#"Returns a field within a map or a struct with the given key. @@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name: ScalarValue) -> Result `b` +/// * `get_field(struct(a, b), 'c1')` => `b` +/// +/// `args` is the (already flattened) argument list of the `get_field` call: +/// `[base, field_name, rest_of_path...]`. When extra path elements remain +/// after resolving the first one (`get_field(named_struct('s', inner), 's', 'k')`), +/// the resolved value is re-wrapped in a `get_field` call for the remaining +/// path so the simplifier can recurse into it on the next pass. +/// +/// Returns `None` — leaving the expression untouched — whenever the rewrite +/// cannot be proven safe, e.g. a non-literal field name, a `named_struct` +/// with a non-literal field name (which might shadow the requested field at +/// runtime), or a field the constructor does not produce. +fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option { + let [base, field_name, rest @ ..] = args else { + return None; + }; + + // The accessed field name must be a non-empty string literal. + let Expr::Literal(field_name, _) = field_name else { + return None; + }; + let field_name = field_name + .try_as_str() + .flatten() + .filter(|s| !s.is_empty())?; + + let Expr::ScalarFunction(ScalarFunction { + func, + args: ctor_args, + }) = base + else { + return None; + }; + + let value = if func.inner().is::() { + // named_struct(name1, value1, name2, value2, ...) + if !ctor_args.len().is_multiple_of(2) { + return None; + } + let mut matched = None; + for pair in ctor_args.chunks_exact(2) { + // Every name must be a literal string: a non-literal name could + // evaluate to `field_name` at runtime and shadow a later match, + // so we cannot safely pick a field unless all names are known. + let Expr::Literal(name, _) = &pair[0] else { + return None; + }; + let name = name.try_as_str().flatten()?; + // `column_by_name` resolves to the first match, so do the same. + if matched.is_none() && name == field_name { + matched = Some(&pair[1]); + } + } + matched?.clone() + } else if func.inner().is::() { + // struct(value0, value1, ...) produces fields named c0, c1, ... + let index: usize = field_name.strip_prefix('c')?.parse().ok()?; + // Reject non-canonical spellings (e.g. "c01") that name no real field. + if format!("c{index}") != field_name { + return None; + } + ctor_args.get(index)?.clone() + } else { + return None; + }; + + if rest.is_empty() { + return Some(value); + } + + // Remaining path elements: re-wrap as get_field(value, rest...) and let + // the simplifier resolve the rest on a subsequent pass. + let mut new_args = Vec::with_capacity(rest.len() + 1); + new_args.push(value); + new_args.extend_from_slice(rest); + Some(Expr::ScalarFunction(ScalarFunction::new_udf( + Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())), + new_args, + ))) +} + impl GetFieldFunc { pub fn new() -> Self { Self { @@ -479,14 +572,12 @@ impl ScalarUDFImpl for GetFieldFunc { // Flatten all nested get_field calls in a single pass // Pattern: get_field(get_field(get_field(base, a), b), c) => get_field(base, a, b, c) - - // Collect path arguments from all nested levels - let mut path_args_stack = Vec::new(); + // + // `path_args_stack` collects each level's field-name arguments, + // outermost first; it is reversed below to restore access order. + let mut path_args_stack = vec![&args[1..]]; let mut current_expr = &args[0]; - // Push the outermost path arguments first - path_args_stack.push(&args[1..]); - // Walk down the chain of nested get_field calls let base_expr = loop { if let Expr::ScalarFunction(ScalarFunction { @@ -506,28 +597,33 @@ impl ScalarUDFImpl for GetFieldFunc { break current_expr; }; - // If no nested get_field calls were found, return original - if path_args_stack.len() == args.len() - 1 { - return Ok(ExprSimplifyResult::Original(args)); - } + // Whether any nested get_field calls were collapsed above. + let did_flatten = path_args_stack.len() > 1; - // If we found any nested get_field calls, flatten them - // Build merged args: [base, ...all_path_args_in_correct_order] + // Build merged args: [base, ...all path args in access order]. + // The stack holds path slices outermost-first, so iterate in reverse. let mut merged_args = vec![base_expr.clone()]; - - // Add path args in reverse order (innermost to outermost) - // Stack is: [outermost_paths, ..., innermost_paths] - // We want: [base, innermost_paths, ..., outermost_paths] for path_slice in path_args_stack.iter().rev() { merged_args.extend_from_slice(path_slice); } - Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction( - ScalarFunction::new_udf( - Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())), - merged_args, - ), - ))) + // Resolve field accesses against an inline struct constructor: + // get_field(named_struct('min', a, 'max', b), 'max') => b + if let Some(simplified) = simplify_get_field_over_struct_constructor(&merged_args) + { + return Ok(ExprSimplifyResult::Simplified(simplified)); + } + + if did_flatten { + return Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction( + ScalarFunction::new_udf( + Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())), + merged_args, + ), + ))); + } + + Ok(ExprSimplifyResult::Original(args)) } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { @@ -828,4 +924,181 @@ mod tests { let args = vec![ExpressionPlacement::Literal, ExpressionPlacement::Literal]; assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace); } + + // --- get_field over struct constructor simplification -------------------- + + use datafusion_common::Column; + use datafusion_expr::simplify::SimplifyContext; + + /// A non-empty string literal expression. + fn lit_str(s: &str) -> Expr { + Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None) + } + + /// A column reference expression. + fn col(name: &str) -> Expr { + Expr::Column(Column::from_name(name)) + } + + fn scalar_fn(udf: ScalarUDF, args: Vec) -> Expr { + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), args)) + } + + /// `named_struct(name1, value1, name2, value2, ...)`. + fn named_struct(pairs: Vec<(&str, Expr)>) -> Expr { + let args = pairs + .into_iter() + .flat_map(|(name, value)| [lit_str(name), value]) + .collect(); + scalar_fn(ScalarUDF::new_from_impl(NamedStructFunc::new()), args) + } + + /// `struct(value0, value1, ...)`. + fn struct_fn(values: Vec) -> Expr { + scalar_fn(ScalarUDF::new_from_impl(StructFunc::new()), values) + } + + /// `get_field(args...)`. + fn get_field(args: Vec) -> Expr { + scalar_fn(ScalarUDF::new_from_impl(GetFieldFunc::new()), args) + } + + /// Run `GetFieldFunc::simplify` once and return the rewritten expression, + /// panicking if the input was left unchanged. + fn simplified(args: Vec) -> Expr { + match GetFieldFunc::new() + .simplify(args, &SimplifyContext::default()) + .unwrap() + { + ExprSimplifyResult::Simplified(expr) => expr, + ExprSimplifyResult::Original(args) => { + panic!("expected the expression to be simplified, got {args:?}") + } + } + } + + /// Assert that `GetFieldFunc::simplify` leaves the arguments unchanged. + fn assert_not_simplified(args: Vec) { + match GetFieldFunc::new() + .simplify(args.clone(), &SimplifyContext::default()) + .unwrap() + { + ExprSimplifyResult::Original(unchanged) => assert_eq!(unchanged, args), + ExprSimplifyResult::Simplified(expr) => { + panic!("expected no simplification, got {expr:?}") + } + } + } + + #[test] + fn simplify_get_field_named_struct_returns_matching_value() { + // get_field(named_struct('min', a, 'max', b), 'max') => b + let expr = get_field(vec![ + named_struct(vec![("min", col("a")), ("max", col("b"))]), + lit_str("max"), + ]); + let Expr::ScalarFunction(ScalarFunction { args, .. }) = expr else { + unreachable!() + }; + assert_eq!(simplified(args), col("b")); + } + + #[test] + fn simplify_get_field_named_struct_first_field() { + // get_field(named_struct('min', a, 'max', b), 'min') => a + let args = vec![ + named_struct(vec![("min", col("a")), ("max", col("b"))]), + lit_str("min"), + ]; + assert_eq!(simplified(args), col("a")); + } + + #[test] + fn simplify_get_field_named_struct_duplicate_names_picks_first() { + // Arrow's `column_by_name` resolves to the first match; mirror that. + let args = vec![ + named_struct(vec![("k", col("a")), ("k", col("b"))]), + lit_str("k"), + ]; + assert_eq!(simplified(args), col("a")); + } + + #[test] + fn simplify_get_field_struct_positional() { + // get_field(struct(a, b), 'c1') => b + let args = vec![struct_fn(vec![col("a"), col("b")]), lit_str("c1")]; + assert_eq!(simplified(args), col("b")); + } + + #[test] + fn simplify_get_field_nested_named_struct() { + // get_field(named_struct('s', named_struct('k', x)), 's', 'k') + // => get_field(named_struct('k', x), 'k') (first pass) + // => x (second pass) + let args = vec![ + named_struct(vec![("s", named_struct(vec![("k", col("x"))]))]), + lit_str("s"), + lit_str("k"), + ]; + let first_pass = simplified(args); + let Expr::ScalarFunction(ScalarFunction { args, .. }) = first_pass else { + panic!("expected a get_field call after the first pass") + }; + assert_eq!(simplified(args), col("x")); + } + + #[test] + fn simplify_get_field_flattens_then_resolves_named_struct() { + // get_field(get_field(named_struct('s', named_struct('k', x)), 's'), 'k') + // flattens to get_field(named_struct(...), 's', 'k') and resolves 's'. + let args = vec![ + get_field(vec![ + named_struct(vec![("s", named_struct(vec![("k", col("x"))]))]), + lit_str("s"), + ]), + lit_str("k"), + ]; + let expected = get_field(vec![named_struct(vec![("k", col("x"))]), lit_str("k")]); + assert_eq!(simplified(args), expected); + } + + #[test] + fn simplify_get_field_dynamic_field_name_left_alone() { + // A non-literal field name cannot be resolved at plan time. + let args = vec![named_struct(vec![("a", col("x"))]), col("field_name")]; + assert_not_simplified(args); + } + + #[test] + fn simplify_get_field_dynamic_struct_name_left_alone() { + // A non-literal name inside named_struct could shadow the requested + // field at runtime, so the rewrite must bail out entirely. + let named_struct_with_dynamic_name = scalar_fn( + ScalarUDF::new_from_impl(NamedStructFunc::new()), + vec![col("dynamic_name"), col("x")], + ); + let args = vec![named_struct_with_dynamic_name, lit_str("a")]; + assert_not_simplified(args); + } + + #[test] + fn simplify_get_field_missing_field_left_alone() { + // The named_struct does not produce field 'missing'. + let args = vec![named_struct(vec![("a", col("x"))]), lit_str("missing")]; + assert_not_simplified(args); + } + + #[test] + fn simplify_get_field_non_canonical_struct_field_left_alone() { + // 'c01' is not a real field name produced by `struct(...)`. + let args = vec![struct_fn(vec![col("a"), col("b")]), lit_str("c01")]; + assert_not_simplified(args); + } + + #[test] + fn simplify_get_field_column_base_left_alone() { + // A plain column base is not a struct constructor. + let args = vec![col("s"), lit_str("a")]; + assert_not_simplified(args); + } } diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index ffd48d5996576..6907e489e6905 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -1705,15 +1705,16 @@ EXPLAIN SELECT named_struct('sum', a + b) AS s FROM ordered ORDER BY s['sum']; ---- physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[named_struct(sum, a@0 + b@1) as s], file_type=csv, has_header=true -# Wrapping a non-ordered column into a struct — SortExec required +# Wrapping a non-ordered column into a struct — SortExec required. # Reuses the `ordered` table above which has WITH ORDER (a + b). +# The simplifier resolves `get_field(named_struct(...), 'a')` so the sort key +# is not extracted into a separate scan projection column. query TT EXPLAIN SELECT named_struct('a', a, 'b', b) AS s FROM ordered ORDER BY s['a']; ---- physical_plan -01)ProjectionExec: expr=[s@0 as s] -02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[named_struct(a, a@0, b, b@1) as s, get_field(named_struct(a, a@0, b, b@1), a) as __datafusion_extracted_1], file_type=csv, has_header=true +01)SortExec: expr=[get_field(s@0, a) ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[named_struct(a, a@0, b, b@1) as s], file_type=csv, has_header=true # Simple column ordering tests using a table ordered by (a) statement ok @@ -1737,9 +1738,8 @@ query TT EXPLAIN SELECT named_struct('a', a, 'b', b) AS s FROM ordered_by_a ORDER BY s['b']; ---- physical_plan -01)ProjectionExec: expr=[s@0 as s] -02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[named_struct(a, a@0, b, b@1) as s, get_field(named_struct(a, a@0, b, b@1), b) as __datafusion_extracted_1], file_type=csv, has_header=true +01)SortExec: expr=[get_field(s@0, b) ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[named_struct(a, a@0, b, b@1) as s], file_type=csv, has_header=true # Mixed projection: top-level column alongside struct, order by struct field query TT diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 5cf6e4817d475..982e2c6f4acce 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -126,6 +126,49 @@ physical_plan 01)ProjectionExec: expr=[struct(a@0, b@1, c@2) as struct(values.a,values.b,values.c)] 02)--DataSourceExec: partitions=1, partition_sizes=[1] +# get_field over an inline named_struct is resolved during logical +# simplification: the field access collapses to the underlying expression +# instead of materializing the intermediate struct. +query R +select get_field(named_struct('min', a, 'max', b), 'max') from values; +---- +1.1 +2.2 +3.3 + +query TT +explain select get_field(named_struct('min', a, 'max', b), 'max') from values; +---- +logical_plan +01)Projection: values.b AS named_struct(Utf8("min"),values.a,Utf8("max"),values.b)[max] +02)--TableScan: values projection=[b] +physical_plan +01)ProjectionExec: expr=[b@0 as named_struct(Utf8("min"),values.a,Utf8("max"),values.b)[max]] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# the same simplification applies to the positional struct() constructor, +# whose fields are named c0, c1, ... +query TT +explain select get_field(struct(a, b, c), 'c1') from values; +---- +logical_plan +01)Projection: values.b AS struct(values.a,values.b,values.c)[c1] +02)--TableScan: values projection=[b] +physical_plan +01)ProjectionExec: expr=[b@0 as struct(values.a,values.b,values.c)[c1]] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# nested constructors collapse all the way through +query TT +explain select named_struct('outer', named_struct('inner', a))['outer']['inner'] from values; +---- +logical_plan +01)Projection: values.a AS named_struct(Utf8("outer"),named_struct(Utf8("inner"),values.a))[outer][inner] +02)--TableScan: values projection=[a] +physical_plan +01)ProjectionExec: expr=[a@0 as named_struct(Utf8("outer"),named_struct(Utf8("inner"),values.a))[outer][inner]] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + # error on 0 arguments query error select named_struct(); From cd20ebbb605ac359b3415ee046bfa256d3081232 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 23 May 2026 22:19:28 -0500 Subject: [PATCH 2/6] refactor: centralize get_field UDF construction behind get_field_udf() Both the re-wrap path in simplify_get_field_over_struct_constructor and the flatten path in GetFieldFunc::simplify built a fresh Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())) each time. Share a single OnceLock-backed instance via get_field_udf() to make the reuse and intent clearer. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index d00daed252755..948313855edea 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, BooleanArray, Capacities, MutableArrayData, Scalar, cast::AsArray, make_array, @@ -252,6 +252,16 @@ fn extract_single_field(base: ColumnarValue, name: ScalarValue) -> Result Arc { + static GET_FIELD_UDF: OnceLock> = OnceLock::new(); + Arc::clone( + GET_FIELD_UDF + .get_or_init(|| Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new()))), + ) +} + /// Try to simplify a `get_field` call whose base is an inline struct /// constructor by resolving the field access at plan time. /// @@ -337,7 +347,7 @@ fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option { new_args.push(value); new_args.extend_from_slice(rest); Some(Expr::ScalarFunction(ScalarFunction::new_udf( - Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())), + get_field_udf(), new_args, ))) } @@ -616,10 +626,7 @@ impl ScalarUDFImpl for GetFieldFunc { if did_flatten { return Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction( - ScalarFunction::new_udf( - Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())), - merged_args, - ), + ScalarFunction::new_udf(get_field_udf(), merged_args), ))); } From 4f70d80edc6762b8a6a89eb5f559d24cc870060b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 23 May 2026 22:19:41 -0500 Subject: [PATCH 3/6] docs: note that unaccessed struct fields are intentionally not evaluated Document on simplify_get_field_over_struct_constructor that replacing the access with the selected field drops the expressions for the other fields, so they are no longer evaluated (e.g. get_field(named_struct('a', 1/0, 'b', x), 'b') => x). This matches the optimizer's immutable-expression contract. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 948313855edea..3e14aed32c932 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -283,6 +283,13 @@ fn get_field_udf() -> Arc { /// cannot be proven safe, e.g. a non-literal field name, a `named_struct` /// with a non-literal field name (which might shadow the requested field at /// runtime), or a field the constructor does not produce. +/// +/// Replacing the access with the selected field expression drops the +/// expressions for the other (unaccessed) fields, so they are no longer +/// evaluated — e.g. `get_field(named_struct('a', 1/0, 'b', x), 'b')` becomes +/// `x` and the `1/0` is never evaluated. This is intentional and matches the +/// optimizer's contract for immutable expressions: a simplification may drop +/// sub-expressions whose value is not observed. fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option { let [base, field_name, rest @ ..] = args else { return None; From f799333b0fafe3d0f009c094527a752a9f175c95 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 23 May 2026 22:19:52 -0500 Subject: [PATCH 4/6] docs: clarify dynamic-name bail-out is a deliberate conservative choice Rewrite the comment in the named_struct branch to state the rationale accurately. A non-literal name appearing *before* the first match could evaluate to field_name at runtime and become the real first match, so we must bail. But once a literal match has been found, a later non-literal name can never precede it (column_by_name returns the first match), so bailing there is a deliberate approximation we accept for simplicity, not a correctness requirement. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 3e14aed32c932..e2d87603faba1 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -319,9 +319,16 @@ fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option { } let mut matched = None; for pair in ctor_args.chunks_exact(2) { - // Every name must be a literal string: a non-literal name could - // evaluate to `field_name` at runtime and shadow a later match, - // so we cannot safely pick a field unless all names are known. + // Every name must be a literal string: a non-literal name appearing + // *before* the first match could evaluate to `field_name` at runtime + // and become the real first match (Arrow's `column_by_name` returns + // the first match), so we cannot resolve the access. + // + // We conservatively bail on *any* non-literal name. Once a literal + // match has been found, a later non-literal name is in fact harmless + // — it can never precede the first match — so bailing there is a + // deliberate approximation we accept to keep this check simple, not a + // correctness requirement. let Expr::Literal(name, _) = &pair[0] else { return None; }; From 11a390ea0dca7eba98266ba01a735c25dfed5f27 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 23 May 2026 22:20:53 -0500 Subject: [PATCH 5/6] test: pass args directly in simplify_get_field_named_struct test simplify_get_field_named_struct_returns_matching_value built a get_field expr only to immediately destructure it back into args. Pass args directly, matching the style of the neighboring tests. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index e2d87603faba1..5a5db11c95109 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -1014,13 +1014,10 @@ mod tests { #[test] fn simplify_get_field_named_struct_returns_matching_value() { // get_field(named_struct('min', a, 'max', b), 'max') => b - let expr = get_field(vec![ + let args = vec![ named_struct(vec![("min", col("a")), ("max", col("b"))]), lit_str("max"), - ]); - let Expr::ScalarFunction(ScalarFunction { args, .. }) = expr else { - unreachable!() - }; + ]; assert_eq!(simplified(args), col("b")); } From b2e09783827a49880bf94c45a2724f907c3c514a Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 23 May 2026 22:21:01 -0500 Subject: [PATCH 6/6] test: document NULL field-name invariant with a dedicated test Add simplify_get_field_null_field_name_left_alone to explicitly cover the ScalarValue::Utf8(None) case, where try_as_str().flatten() yields no field name and the expression must be left unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/functions/src/core/getfield.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 5a5db11c95109..93a4cddef453e 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -1087,6 +1087,15 @@ mod tests { assert_not_simplified(args); } + #[test] + fn simplify_get_field_null_field_name_left_alone() { + // A NULL string literal field name resolves to no field, so the + // `try_as_str().flatten()` guard must leave the expression untouched. + let null_field_name = Expr::Literal(ScalarValue::Utf8(None), None); + let args = vec![named_struct(vec![("a", col("x"))]), null_field_name]; + assert_not_simplified(args); + } + #[test] fn simplify_get_field_dynamic_struct_name_left_alone() { // A non-literal name inside named_struct could shadow the requested