diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2ef71d5745a2..5bd7c71015bc 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1663,6 +1663,7 @@ fn get_fsb_array_slice( #[cfg(test)] mod tests { use super::*; + use std::cmp::Ordering; use std::collections::HashMap; use std::fs::File; @@ -2918,8 +2919,29 @@ mod tests { for column in row_group.columns() { assert!(column.offset_index_offset().is_some()); assert!(column.offset_index_length().is_some()); - assert!(column.column_index_offset().is_none()); - assert!(column.column_index_length().is_none()); + assert!(column.column_index_offset().is_some()); + assert!(column.column_index_length().is_some()); + } + } + assert!(file_meta_data.column_index().is_some()); + if let Some(col_indexes) = file_meta_data.column_index() { + for rg_idx in col_indexes { + for idx in rg_idx { + let float_idx = match idx { + ColumnIndexMetaData::DOUBLE(idx) => idx, + _ => panic!("expected double statistics"), + }; + for i in 0..idx.num_pages() as usize { + assert_eq!( + f64::NAN.total_cmp(float_idx.min_value(i).unwrap()), + Ordering::Equal + ); + assert_eq!( + f64::NAN.total_cmp(float_idx.max_value(i).unwrap()), + Ordering::Equal + ); + } + } } } } diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ba8ffc2e92c3..22c84532595c 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -1159,6 +1159,8 @@ pub enum SortOrder { UNSIGNED, /// Comparison is undefined. UNDEFINED, + /// Use IEEE 754 total order. + TOTAL_ORDER, } impl SortOrder { @@ -1179,6 +1181,8 @@ pub enum ColumnOrder { /// Column uses the order defined by its logical or physical type /// (if there is no logical type), parquet-format 2.4.0+. TYPE_DEFINED_ORDER(SortOrder), + /// Column ordering to use for floating point types. + IEEE_754_TOTAL_ORDER, // The following are not defined in the Parquet spec and should always be last. /// Undefined column order, means legacy behaviour before parquet-format 2.4.0. /// Sort order is always SIGNED. @@ -1199,14 +1203,36 @@ impl ColumnOrder { converted_type: ConvertedType, physical_type: Type, ) -> SortOrder { - Self::sort_order_for_type(logical_type.as_ref(), converted_type, physical_type) + Self::column_order_for_type(logical_type.as_ref(), converted_type, physical_type) + .sort_order() + } + + /// Returns the `ColumnOrder` for a physical/logical type. + pub fn column_order_for_type( + logical_type: Option<&LogicalType>, + converted_type: ConvertedType, + physical_type: Type, + ) -> ColumnOrder { + if Some(&LogicalType::Float16) == logical_type + || matches!(physical_type, Type::FLOAT | Type::DOUBLE) + { + ColumnOrder::IEEE_754_TOTAL_ORDER + } else { + let sort_order = + Self::sort_order_for_type(logical_type, converted_type, physical_type, true); + ColumnOrder::TYPE_DEFINED_ORDER(sort_order) + } } /// Returns sort order for a physical/logical type. + /// + /// `is_type_defined` indicates whether the column order for this type is + /// [`ColumnOrder::TYPE_DEFINED_ORDER`]. pub fn sort_order_for_type( logical_type: Option<&LogicalType>, converted_type: ConvertedType, physical_type: Type, + is_type_defined: bool, ) -> SortOrder { match logical_type { Some(logical) => match logical { @@ -1224,18 +1250,28 @@ impl ColumnOrder { LogicalType::Timestamp { .. } => SortOrder::SIGNED, LogicalType::Unknown => SortOrder::UNDEFINED, LogicalType::Uuid => SortOrder::UNSIGNED, - LogicalType::Float16 => SortOrder::SIGNED, + LogicalType::Float16 => { + if is_type_defined { + SortOrder::SIGNED + } else { + SortOrder::TOTAL_ORDER + } + } LogicalType::Variant { .. } | LogicalType::Geometry { .. } | LogicalType::Geography { .. } | LogicalType::_Unknown { .. } => SortOrder::UNDEFINED, }, // Fall back to converted type - None => Self::get_converted_sort_order(converted_type, physical_type), + None => Self::get_converted_sort_order(converted_type, physical_type, is_type_defined), } } - fn get_converted_sort_order(converted_type: ConvertedType, physical_type: Type) -> SortOrder { + fn get_converted_sort_order( + converted_type: ConvertedType, + physical_type: Type, + is_type_defined: bool, + ) -> SortOrder { match converted_type { // Unsigned byte-wise comparison. ConvertedType::UTF8 @@ -1270,24 +1306,35 @@ impl ColumnOrder { } // Fall back to physical type. - ConvertedType::NONE => Self::get_default_sort_order(physical_type), + ConvertedType::NONE => Self::get_default_sort_order(physical_type, is_type_defined), } } /// Returns default sort order based on physical type. - fn get_default_sort_order(physical_type: Type) -> SortOrder { + fn get_default_sort_order(physical_type: Type, is_type_defined: bool) -> SortOrder { match physical_type { // Order: false, true Type::BOOLEAN => SortOrder::UNSIGNED, Type::INT32 | Type::INT64 => SortOrder::SIGNED, Type::INT96 => SortOrder::UNDEFINED, // Notes to remember when comparing float/double values: - // If the min is a NaN, it should be ignored. - // If the max is a NaN, it should be ignored. - // If the min is +0, the row group may contain -0 values as well. - // If the max is -0, the row group may contain +0 values as well. - // When looking for NaN values, min and max should be ignored. - Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, + // If legacy TYPE_DEFINED_ORDER is specified: + // If the min is a NaN, it should be ignored. + // If the max is a NaN, it should be ignored. + // If the min is +0, the row group may contain -0 values as well. + // If the max is -0, the row group may contain +0 values as well. + // When looking for NaN values, min and max should be ignored. + // If IEEE_754_TOTAL_ORDER: + // Examine nan_count to see if NaNs are present. + // If min/max are NaN, that means only NaNs are present. + // If min/max are not NaN, they are ordered according to total order. + Type::FLOAT | Type::DOUBLE => { + if is_type_defined { + SortOrder::SIGNED + } else { + SortOrder::TOTAL_ORDER + } + } // Unsigned byte-wise comparison Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, } @@ -1297,6 +1344,7 @@ impl ColumnOrder { pub fn sort_order(&self) -> SortOrder { match *self { ColumnOrder::TYPE_DEFINED_ORDER(order) => order, + ColumnOrder::IEEE_754_TOTAL_ORDER => SortOrder::TOTAL_ORDER, ColumnOrder::UNDEFINED => SortOrder::SIGNED, ColumnOrder::UNKNOWN => SortOrder::UNDEFINED, } @@ -1315,6 +1363,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { prot.skip_empty_struct()?; Self::TYPE_DEFINED_ORDER(SortOrder::SIGNED) } + 2 => { + prot.skip_empty_struct()?; + Self::IEEE_754_TOTAL_ORDER + } _ => { prot.skip(field_ident.field_type)?; Self::UNKNOWN @@ -1339,6 +1391,10 @@ impl WriteThrift for ColumnOrder { writer.write_field_begin(FieldType::Struct, 1, 0)?; writer.write_struct_end()?; } + Self::IEEE_754_TOTAL_ORDER => { + writer.write_field_begin(FieldType::Struct, 2, 0)?; + writer.write_struct_end()?; + } _ => return Err(general_err!("Attempt to write undefined ColumnOrder")), } // write end of struct for this union @@ -2181,6 +2237,7 @@ mod tests { assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED"); assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED"); assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED"); + assert_eq!(SortOrder::TOTAL_ORDER.to_string(), "TOTAL_ORDER"); } #[test] @@ -2197,6 +2254,10 @@ mod tests { ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(), "TYPE_DEFINED_ORDER(UNDEFINED)" ); + assert_eq!( + ColumnOrder::IEEE_754_TOTAL_ORDER.to_string(), + "IEEE_754_TOTAL_ORDER" + ); assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED"); } @@ -2213,7 +2274,12 @@ mod tests { fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( - ColumnOrder::get_sort_order(Some(tpe), ConvertedType::NONE, Type::BYTE_ARRAY), + ColumnOrder::column_order_for_type( + Some(&tpe), + ConvertedType::NONE, + Type::BYTE_ARRAY + ) + .sort_order(), expected_order ); } @@ -2292,10 +2358,12 @@ mod tests { is_adjusted_to_u_t_c: true, unit: TimeUnit::NANOS, }, - LogicalType::Float16, ]; check_sort_order(signed, SortOrder::SIGNED); + let float = vec![LogicalType::Float16]; + check_sort_order(float, SortOrder::TOTAL_ORDER); + // Undefined comparison let undefined = vec![ LogicalType::List, @@ -2316,7 +2384,7 @@ mod tests { fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( - ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY), + ColumnOrder::column_order_for_type(None, tpe, Type::BYTE_ARRAY).sort_order(), expected_order ); } @@ -2368,35 +2436,43 @@ mod tests { fn test_column_order_get_default_sort_order() { // Comparison based on physical type assert_eq!( - ColumnOrder::get_default_sort_order(Type::BOOLEAN), + ColumnOrder::get_default_sort_order(Type::BOOLEAN, true), SortOrder::UNSIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::INT32), + ColumnOrder::get_default_sort_order(Type::INT32, true), SortOrder::SIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::INT64), + ColumnOrder::get_default_sort_order(Type::INT64, true), SortOrder::SIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::INT96), + ColumnOrder::get_default_sort_order(Type::INT96, true), SortOrder::UNDEFINED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::FLOAT), + ColumnOrder::get_default_sort_order(Type::FLOAT, false), + SortOrder::TOTAL_ORDER + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::DOUBLE, false), + SortOrder::TOTAL_ORDER + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::FLOAT, true), SortOrder::SIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::DOUBLE), + ColumnOrder::get_default_sort_order(Type::DOUBLE, true), SortOrder::SIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY), + ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY, true), SortOrder::UNSIGNED ); assert_eq!( - ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY), + ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY, true), SortOrder::UNSIGNED ); } @@ -2415,6 +2491,10 @@ mod tests { ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(), SortOrder::UNDEFINED ); + assert_eq!( + ColumnOrder::IEEE_754_TOTAL_ORDER.sort_order(), + SortOrder::TOTAL_ORDER + ); assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED); } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 11d4f3142a20..1e2d7e2ee16f 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -16,12 +16,11 @@ // under the License. use bytes::Bytes; -use half::f16; -use crate::basic::{ConvertedType, Encoding, LogicalType, Type}; +use crate::basic::{ConvertedType, Encoding}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ - compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, + compare_greater, fallback_encoding, has_dictionary_support, update_max, update_min, }; use crate::data_type::DataType; use crate::data_type::private::ParquetValueType; @@ -330,19 +329,11 @@ where T: ParquetValueType + 'a, I: Iterator, { - let first = loop { - let next = iter.next()?; - if !is_nan(descr, next) { - break next; - } - }; + let first = iter.next()?; let mut min = first; let mut max = first; for val in iter { - if is_nan(descr, val) { - continue; - } if compare_greater(descr, min, val) { min = val; } @@ -351,37 +342,7 @@ where } } - // Float/Double statistics have special case for zero. - // - // If computed min is zero, whether negative or positive, - // the spec states that the min should be written as -0.0 - // (negative zero) - // - // For max, it has similar logic but will be written as 0.0 - // (positive zero) - let min = replace_zero(min, descr, -0.0); - let max = replace_zero(max, descr, 0.0); - - Some((min, max)) -} - -#[inline] -fn replace_zero(val: &T, descr: &ColumnDescriptor, replace: f32) -> T { - match T::PHYSICAL_TYPE { - Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { - T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap() - } - Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { - T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap() - } - Type::FIXED_LEN_BYTE_ARRAY - if descr.logical_type_ref() == Some(LogicalType::Float16).as_ref() - && f16::from_le_bytes(val.as_bytes().try_into().unwrap()) == f16::NEG_ZERO => - { - T::try_from_le_slice(&f16::to_le_bytes(f16::from_f32(replace))).unwrap() - } - _ => val.clone(), - } + Some((min.clone(), max.clone())) } fn update_geo_stats_accumulator<'a, T, I>(bounder: &mut dyn GeoStatsAccumulator, iter: I) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 46f90d3f7762..dc4f30127ace 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -23,6 +23,7 @@ use half::f16; use crate::bloom_filter::Sbbf; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; +use std::cmp::Ordering; use std::collections::{BTreeSet, VecDeque}; use std::str; @@ -1360,43 +1361,21 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } fn update_min(descr: &ColumnDescriptor, val: &T, min: &mut Option) { - update_stat::(descr, val, min, |cur| compare_greater(descr, cur, val)) + update_stat::(val, min, |cur| compare_greater(descr, cur, val)) } fn update_max(descr: &ColumnDescriptor, val: &T, max: &mut Option) { - update_stat::(descr, val, max, |cur| compare_greater(descr, val, cur)) -} - -#[inline] -#[allow(clippy::eq_op)] -fn is_nan(descr: &ColumnDescriptor, val: &T) -> bool { - match T::PHYSICAL_TYPE { - Type::FLOAT | Type::DOUBLE => val != val, - Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type_ref() == Some(&LogicalType::Float16) => { - let val = val.as_bytes(); - let val = f16::from_le_bytes([val[0], val[1]]); - val.is_nan() - } - _ => false, - } + update_stat::(val, max, |cur| compare_greater(descr, val, cur)) } /// Perform a conditional update of `cur`, skipping any NaN values /// /// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls `should_update` with /// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true` -fn update_stat( - descr: &ColumnDescriptor, - val: &T, - cur: &mut Option, - should_update: F, -) where +fn update_stat(val: &T, cur: &mut Option, should_update: F) +where F: Fn(&T) -> bool, { - if is_nan(descr, val) { - return; - } - if cur.as_ref().is_none_or(should_update) { *cur = Some(val.clone()); } @@ -1405,6 +1384,16 @@ fn update_stat( /// Evaluate `a > b` according to underlying logical type. fn compare_greater(descr: &ColumnDescriptor, a: &T, b: &T) -> bool { match T::PHYSICAL_TYPE { + Type::FLOAT => { + let a = f32::from_le_bytes(a.as_bytes().try_into().unwrap()); + let b = f32::from_le_bytes(b.as_bytes().try_into().unwrap()); + return a.total_cmp(&b) == Ordering::Greater; + } + Type::DOUBLE => { + let a = f64::from_le_bytes(a.as_bytes().try_into().unwrap()); + let b = f64::from_le_bytes(b.as_bytes().try_into().unwrap()); + return a.total_cmp(&b) == Ordering::Greater; + } Type::INT32 | Type::INT64 => { if let Some(LogicalType::Integer { is_signed: false, .. @@ -1482,7 +1471,7 @@ fn compare_greater_unsigned_int(a: &T, b: &T) -> bool { fn compare_greater_f16(a: &[u8], b: &[u8]) -> bool { let a = f16::from_le_bytes(a.try_into().unwrap()); let b = f16::from_le_bytes(b.try_into().unwrap()); - a > b + a.total_cmp(&b) == Ordering::Greater } /// Signed comparison of bytes arrays @@ -2572,7 +2561,7 @@ mod tests { #[test] fn test_float_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &3.0); @@ -2584,7 +2573,7 @@ mod tests { #[test] fn test_double_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &3.0); @@ -2629,6 +2618,139 @@ mod tests { } } + #[test] + fn test_ieee754_total_order_float() { + // Test IEEE 754 total order for f32 + // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf < +NaN + let neg_nan = f32::from_bits(0xffc00000); + let neg_inf = f32::NEG_INFINITY; + let neg_one = -1.0_f32; + let neg_zero = -0.0_f32; + let pos_zero = 0.0_f32; + let pos_one = 1.0_f32; + let pos_inf = f32::INFINITY; + let pos_nan = f32::from_bits(0x7fc00000); + + let values = vec![ + pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf, pos_zero, + ]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Float(stats) = stats { + // With IEEE 754 total order, min should be -NaN, max should be +NaN + assert_eq!( + stats.min_opt().unwrap().total_cmp(&neg_nan), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&pos_nan), + Ordering::Equal + ); + } else { + panic!("Expected float statistics"); + } + } + + #[test] + fn test_ieee754_total_order_float_only_nan() { + let neg_nan1 = f32::from_bits(0xffc00000); + let neg_nan2 = f32::from_bits(0xffc00001); + let neg_nan3 = f32::from_bits(0xffc00002); + let pos_nan1 = f32::from_bits(0x7fc00000); + let pos_nan2 = f32::from_bits(0x7fc00001); + let pos_nan3 = f32::from_bits(0x7fc00002); + + let values = vec![neg_nan1, neg_nan2, neg_nan3, pos_nan1, pos_nan2, pos_nan3]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Float(stats) = stats { + // With IEEE 754 total order, min should be neg_nan3, max should be pos_nan3 + assert_eq!( + stats.min_opt().unwrap().total_cmp(&neg_nan3), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&pos_nan3), + Ordering::Equal + ); + } else { + panic!("Expected float statistics"); + } + } + + #[test] + fn test_ieee754_total_order_double() { + // Test IEEE 754 total order for f64 + let neg_nan = f64::from_bits(0xfff8000000000000); + let neg_inf = f64::NEG_INFINITY; + let neg_one = -1.0_f64; + let neg_zero = -0.0_f64; + let pos_zero = 0.0_f64; + let pos_one = 1.0_f64; + let pos_inf = f64::INFINITY; + let pos_nan = f64::from_bits(0x7ff8000000000000); + + let values = vec![ + pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf, pos_zero, + ]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Double(stats) = stats { + // With IEEE 754 total order, min should be -NaN, max should be +NaN + assert_eq!( + stats.min_opt().unwrap().total_cmp(&neg_nan), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&pos_nan), + Ordering::Equal + ); + } else { + panic!("Expected double statistics"); + } + } + + #[test] + fn test_ieee754_total_order_double_only_nan() { + let neg_nan1 = f64::from_bits(0xfff8000000000000); + let neg_nan2 = f64::from_bits(0xfff8000000000001); + let neg_nan3 = f64::from_bits(0xfff8000000000002); + let pos_nan1 = f64::from_bits(0x7ff8000000000000); + let pos_nan2 = f64::from_bits(0x7ff8000000000001); + let pos_nan3 = f64::from_bits(0x7ff8000000000002); + + let values = vec![neg_nan1, neg_nan2, neg_nan3, pos_nan1, pos_nan2, pos_nan3]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Double(stats) = stats { + assert_eq!( + stats.min_opt().unwrap().total_cmp(&neg_nan3), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&pos_nan3), + Ordering::Equal + ); + } else { + panic!("Expected float statistics"); + } + } + + #[test] + fn test_ieee754_total_order_zeros() { + // Test that -0.0 and +0.0 are handled correctly + let values = vec![-0.0_f32, 0.0_f32, -0.0_f32, 0.0_f32]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Float(stats) = stats { + // With IEEE 754 total order, -0.0 < +0.0 + assert_eq!(stats.min_opt().unwrap().to_bits(), (-0.0_f32).to_bits()); + assert_eq!(stats.max_opt().unwrap().to_bits(), 0.0_f32.to_bits()); + } else { + panic!("Expected float statistics"); + } + } + #[test] fn test_column_writer_check_float16_min_max() { let input = [ @@ -2642,7 +2764,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!( stats.min_opt().unwrap(), &ByteArray::from(-f16::from_f32(2.0)) @@ -2661,11 +2783,11 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( - stats.max_opt().unwrap(), - &ByteArray::from(f16::ONE + f16::ONE) + stats.max_bytes_opt(), + Some(ByteArray::from(f16::NAN).as_bytes()) ); } @@ -2677,11 +2799,11 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( - stats.max_opt().unwrap(), - &ByteArray::from(f16::ONE + f16::ONE) + stats.max_bytes_opt(), + Some(ByteArray::from(f16::NAN).as_bytes()) ); } @@ -2693,11 +2815,11 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( - stats.max_opt().unwrap(), - &ByteArray::from(f16::ONE + f16::ONE) + stats.max_bytes_opt(), + Some(ByteArray::from(f16::NAN).as_bytes()) ); } @@ -2709,9 +2831,15 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.min_bytes_opt().is_none()); - assert!(stats.max_bytes_opt().is_none()); - assert!(stats.is_min_max_backwards_compatible()); + assert_eq!( + stats.min_bytes_opt(), + Some(ByteArray::from(f16::NAN).as_bytes()) + ); + assert_eq!( + stats.max_bytes_opt(), + Some(ByteArray::from(f16::NAN).as_bytes()) + ); + assert!(!stats.is_min_max_backwards_compatible()); } #[test] @@ -2722,8 +2850,8 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert!(!stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } @@ -2735,9 +2863,9 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); } #[test] @@ -2748,9 +2876,9 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); + assert!(!stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::NAN)); } #[test] @@ -2761,18 +2889,21 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(-f16::PI)); - assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::NAN)); } #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f32::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Float"); } @@ -2781,10 +2912,13 @@ mod tests { #[test] fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f32::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Float"); } @@ -2793,19 +2927,28 @@ mod tests { #[test] fn test_float_statistics_nan_only() { let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); - assert!(stats.min_bytes_opt().is_none()); - assert!(stats.max_bytes_opt().is_none()); - assert!(stats.is_min_max_backwards_compatible()); - assert!(matches!(stats, Statistics::Float(_))); + assert!(!stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!( + stats.min_opt().unwrap().total_cmp(&f32::NAN), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f32::NAN), + Ordering::Equal + ); + } else { + panic!("expecting Statistics::Float"); + } } #[test] fn test_float_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-0.0); - assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &0.0); + assert!(stats.min_opt().unwrap().is_sign_positive()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2816,12 +2959,12 @@ mod tests { #[test] fn test_float_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); assert!(stats.min_opt().unwrap().is_sign_negative()); - assert_eq!(stats.max_opt().unwrap(), &0.0); - assert!(stats.max_opt().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &-0.0); + assert!(stats.max_opt().unwrap().is_sign_negative()); } else { panic!("expecting Statistics::Float"); } @@ -2830,11 +2973,14 @@ mod tests { #[test] fn test_float_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-0.0); - assert!(stats.min_opt().unwrap().is_sign_negative()); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &0.0); + assert!(stats.min_opt().unwrap().is_sign_positive()); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f32::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Float"); } @@ -2842,12 +2988,15 @@ mod tests { #[test] fn test_float_statistics_neg_zero_max() { - let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); - assert!(stats.is_min_max_backwards_compatible()); + let stats = statistics_roundtrip::(&[-0.0, -1.0, -f32::NAN, -2.0]); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-2.0); - assert_eq!(stats.max_opt().unwrap(), &0.0); - assert!(stats.max_opt().unwrap().is_sign_positive()); + assert_eq!( + stats.min_opt().unwrap().total_cmp(&-f32::NAN), + Ordering::Equal + ); + assert_eq!(stats.max_opt().unwrap(), &-0.0); + assert!(stats.max_opt().unwrap().is_sign_negative()); } else { panic!("expecting Statistics::Float"); } @@ -2856,10 +3005,13 @@ mod tests { #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f64::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Double"); } @@ -2868,10 +3020,13 @@ mod tests { #[test] fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f64::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Double"); } @@ -2880,19 +3035,28 @@ mod tests { #[test] fn test_double_statistics_nan_only() { let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); - assert!(stats.min_bytes_opt().is_none()); - assert!(stats.max_bytes_opt().is_none()); - assert!(matches!(stats, Statistics::Double(_))); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!( + stats.min_opt().unwrap().total_cmp(&f64::NAN), + Ordering::Equal + ); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f64::NAN), + Ordering::Equal + ); + } else { + panic!("expecting Statistics::Double"); + } } #[test] fn test_double_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-0.0); - assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &0.0); + assert!(stats.min_opt().unwrap().is_sign_positive()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2903,12 +3067,12 @@ mod tests { #[test] fn test_double_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); assert!(stats.min_opt().unwrap().is_sign_negative()); - assert_eq!(stats.max_opt().unwrap(), &0.0); - assert!(stats.max_opt().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &-0.0); + assert!(stats.max_opt().unwrap().is_sign_negative()); } else { panic!("expecting Statistics::Double"); } @@ -2917,11 +3081,14 @@ mod tests { #[test] fn test_double_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); - assert!(stats.is_min_max_backwards_compatible()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-0.0); - assert!(stats.min_opt().unwrap().is_sign_negative()); - assert_eq!(stats.max_opt().unwrap(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &0.0); + assert!(stats.min_opt().unwrap().is_sign_positive()); + assert_eq!( + stats.max_opt().unwrap().total_cmp(&f64::NAN), + Ordering::Equal + ); } else { panic!("expecting Statistics::Double"); } @@ -2929,12 +3096,15 @@ mod tests { #[test] fn test_double_statistics_neg_zero_max() { - let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); - assert!(stats.is_min_max_backwards_compatible()); + let stats = statistics_roundtrip::(&[-0.0, -1.0, -f64::NAN, -2.0]); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &-2.0); - assert_eq!(stats.max_opt().unwrap(), &0.0); - assert!(stats.max_opt().unwrap().is_sign_positive()); + assert_eq!( + stats.min_opt().unwrap().total_cmp(&-f64::NAN), + Ordering::Equal + ); + assert_eq!(stats.max_opt().unwrap(), &-0.0); + assert!(stats.max_opt().unwrap().is_sign_negative()); } else { panic!("expecting Statistics::Double"); } diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 88cb96f35555..7c0efbbedc6f 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -873,6 +873,7 @@ pub(crate) fn parquet_metadata_from_bytes( column.logical_type_ref(), column.converted_type(), column.physical_type(), + true, ); cos[i] = ColumnOrder::TYPE_DEFINED_ORDER(sort_order); } diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 275b4ff28e56..fa2eda3c4edc 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -203,22 +203,16 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { let column_indexes = self.finalize_column_indexes()?; let offset_indexes = self.finalize_offset_indexes()?; - // We only include ColumnOrder for leaf nodes. - // Currently only supported ColumnOrder is TypeDefinedOrder so we set this - // for all leaf nodes. - // Even if the column has an undefined sort order, such as INTERVAL, this - // is still technically the defined TYPEORDER so it should still be set. let column_orders = self .schema_descr .columns() .iter() .map(|col| { - let sort_order = ColumnOrder::sort_order_for_type( + ColumnOrder::column_order_for_type( col.logical_type_ref(), col.converted_type(), col.physical_type(), - ); - ColumnOrder::TYPE_DEFINED_ORDER(sort_order) + ) }) .collect(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7d69904451d3..2cfbf1a91ebf 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1226,7 +1226,7 @@ mod tests { // INTERVAL ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED), // Float16 - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + ColumnOrder::IEEE_754_TOTAL_ORDER, // String ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED), ]; diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 2925557e7b86..76ad66239718 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -997,11 +997,19 @@ impl ColumnDescriptor { /// Returns the sort order for this column pub fn sort_order(&self) -> SortOrder { - ColumnOrder::sort_order_for_type( - self.logical_type_ref(), - self.converted_type(), - self.physical_type(), - ) + match self.primitive_type.as_ref() { + Type::PrimitiveType { + basic_info, + physical_type, + .. + } => ColumnOrder::column_order_for_type( + basic_info.logical_type_ref(), + basic_info.converted_type(), + *physical_type, + ) + .sort_order(), + _ => SortOrder::UNDEFINED, + } } } diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 4f7ddcff4ad1..271347cb7777 100644 --- a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -781,7 +781,7 @@ async fn test_float_16() { expected_min: Arc::new(Float16Array::from(vec![ f16::from_f32(-5.), f16::from_f32(-4.), - f16::from_f32(-0.), + f16::from_f32(0.), f16::from_f32(5.), ])), // maxes are [-1, 0, 4, 9] @@ -817,7 +817,7 @@ async fn test_float_32() { Test { reader: &reader, // mins are [-5, -4, 0, 5] - expected_min: Arc::new(Float32Array::from(vec![-5., -4., -0., 5.0])), + expected_min: Arc::new(Float32Array::from(vec![-5., -4., 0., 5.0])), // maxes are [-1, 0, 4, 9] expected_max: Arc::new(Float32Array::from(vec![-1., 0., 4., 9.])), // nulls are [0, 0, 0, 0] @@ -846,7 +846,7 @@ async fn test_float_64() { Test { reader: &reader, // mins are [-5, -4, 0, 5] - expected_min: Arc::new(Float64Array::from(vec![-5., -4., -0., 5.0])), + expected_min: Arc::new(Float64Array::from(vec![-5., -4., 0., 5.0])), // maxes are [-1, 0, 4, 9] expected_max: Arc::new(Float64Array::from(vec![-1., 0., 4., 9.])), // nulls are [0, 0, 0, 0] @@ -1858,7 +1858,7 @@ async fn test_numeric_limits_float() { Test { reader: &reader, expected_min: Arc::new(Float32Array::from(vec![-1.0, -100.0])), - expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])), + expected_max: Arc::new(Float32Array::from(vec![f32::NAN, f32::NAN])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), // stats are exact @@ -1872,7 +1872,7 @@ async fn test_numeric_limits_float() { Test { reader: &reader, expected_min: Arc::new(Float64Array::from(vec![-1.0, -100.0])), - expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])), + expected_max: Arc::new(Float64Array::from(vec![f64::NAN, f64::NAN])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), // stats are exact @@ -1897,7 +1897,7 @@ async fn test_float64() { Test { reader: &reader, - expected_min: Arc::new(Float64Array::from(vec![-5.0, -4.0, -0.0, 5.0])), + expected_min: Arc::new(Float64Array::from(vec![-5.0, -4.0, 0.0, 5.0])), expected_max: Arc::new(Float64Array::from(vec![-1.0, 0.0, 4.0, 9.0])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), @@ -1925,7 +1925,7 @@ async fn test_float16() { Test { reader: &reader, expected_min: Arc::new(Float16Array::from( - vec![-5.0, -4.0, -0.0, 5.0] + vec![-5.0, -4.0, 0.0, 5.0] .into_iter() .map(f16::from_f32) .collect::>(),