From acd4f524908f2ab18fc12ec5f6854913d5f22f2d Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 4 May 2021 10:28:01 +0200 Subject: [PATCH] fix NaN handling in parquet statistics Closes #225. --- parquet/src/column/writer.rs | 90 ++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs index 0b56594c0b60..833d82cb5699 100644 --- a/parquet/src/column/writer.rs +++ b/parquet/src/column/writer.rs @@ -922,11 +922,14 @@ impl ColumnWriterImpl { } fn update_page_min_max(&mut self, val: &T::T) { - if self.min_page_value.as_ref().map_or(true, |min| min > val) { - self.min_page_value = Some(val.clone()); - } - if self.max_page_value.as_ref().map_or(true, |max| max < val) { - self.max_page_value = Some(val.clone()); + // simple "isNaN" check that works for all types + if val == val { + if self.min_page_value.as_ref().map_or(true, |min| min > val) { + self.min_page_value = Some(val.clone()); + } + if self.max_page_value.as_ref().map_or(true, |max| max < val) { + self.max_page_value = Some(val.clone()); + } } } @@ -1652,6 +1655,68 @@ mod tests { ); } + #[test] + fn test_float_statistics_nan_middle() { + let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &1.0); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_nan_start() { + let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &1.0); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_nan_only() { + let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); + assert!(!stats.has_min_max_set()); + assert!(matches!(stats, Statistics::Float(_))); + } + + #[test] + fn test_double_statistics_nan_middle() { + let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &1.0); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_double_statistics_nan_start() { + let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &1.0); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_double_statistics_nan_only() { + let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); + assert!(!stats.has_min_max_set()); + assert!(matches!(stats, Statistics::Double(_))); + } + /// Performs write-read roundtrip with randomly generated values and levels. /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write /// for a column. @@ -1905,4 +1970,19 @@ mod tests { Ok(()) } } + + /// Write data into parquet using [`get_test_page_writer`] and [`get_test_column_writer`] and returns generated statistics. + fn statistics_roundtrip(values: &[::T]) -> Statistics { + let page_writer = get_test_page_writer(); + let props = Arc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(values, None, None).unwrap(); + + let (_bytes_written, _rows_written, metadata) = writer.close().unwrap(); + if let Some(stats) = metadata.statistics() { + stats.clone() + } else { + panic!("metadata missing statistics"); + } + } }