From d5857ed63abc216dad3a18ab7ad66d5103fe614c Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:24:05 -0800 Subject: [PATCH 1/7] fix warnings --- rust/src/arrow.rs | 4 +--- rust/src/datatypes.rs | 27 +++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/rust/src/arrow.rs b/rust/src/arrow.rs index 2e3f02c13e..24c0fbf80b 100644 --- a/rust/src/arrow.rs +++ b/rust/src/arrow.rs @@ -19,12 +19,10 @@ //! //! To improve Arrow-RS egonomitic -use arrow_array::types::UInt8Type; use arrow_array::{ Array, FixedSizeBinaryArray, FixedSizeListArray, Int32Array, ListArray, UInt8Array, }; -use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType::FixedSizeBinary; +use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType, Field}; use crate::error::Result; diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index b4e85f0d01..baae8c8205 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -319,6 +319,18 @@ impl Field { ) } + /// Recursively set field ID and parent ID for this field and all its children. + fn set_id(&mut self, parent_id: i32, id_seed: &mut i32) { + self.parent_id = parent_id; + if self.id < 0 { + self.id = *id_seed; + *id_seed += 1; + } + self.children + .iter_mut() + .for_each(|f| f.set_id(self.id, id_seed)); + } + // Find any nested child with a specific field id fn mut_field_by_id(&mut self, id: i32) -> Option<&mut Field> { for child in self.children.as_mut_slice() { @@ -558,6 +570,14 @@ impl Schema { } Ok(()) } + + fn set_field_id(&mut self) { + let mut current_id = self.max_field_id().unwrap_or(-1); + self.fields + .iter_mut() + .for_each(|f| f.set_id(-1, &mut current_id)); + } + } impl fmt::Display for Schema { @@ -574,14 +594,17 @@ impl TryFrom<&ArrowSchema> for Schema { type Error = Error; fn try_from(schema: &ArrowSchema) -> Result { - Ok(Self { + let mut schema = Self { fields: schema .fields .iter() .map(Field::try_from) .collect::>()?, metadata: schema.metadata.clone(), - }) + }; + schema.set_field_id(); + + Ok(schema) } } From 0f6ef45b28df575d8ea544cc52b78e15d1d153b3 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:24:41 -0800 Subject: [PATCH 2/7] fix warnings --- rust/src/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index baae8c8205..2837918e06 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -572,7 +572,7 @@ impl Schema { } fn set_field_id(&mut self) { - let mut current_id = self.max_field_id().unwrap_or(-1); + let mut current_id = self.max_field_id().unwrap_or(-1) + 1; self.fields .iter_mut() .for_each(|f| f.set_id(-1, &mut current_id)); From 49d74c4bf9469ec0501c60b4ea01fcd811f9fa02 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:29:50 -0800 Subject: [PATCH 3/7] fix project by id --- rust/src/datatypes.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index 2837918e06..bb403948ea 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -782,7 +782,7 @@ mod tests { ), ArrowField::new("c", DataType::Float64, false), ]); - assert_eq!(projected, Schema::try_from(&expected_arrow_schema).unwrap()); + assert_eq!(ArrowSchema::from(&projected), expected_arrow_schema); } #[test] @@ -801,6 +801,19 @@ mod tests { ArrowField::new("c", DataType::Float64, false), ]); let schema = Schema::try_from(&arrow_schema).unwrap(); - let projected = schema.project_by_ids(&[1, 4, 5]).unwrap(); + let projected = schema.project_by_ids(&[1, 2, 4, 5]).unwrap(); + + let expected_arrow_schema = ArrowSchema::new(vec![ + ArrowField::new( + "b", + DataType::Struct(vec![ + ArrowField::new("f1", DataType::Utf8, true), + ArrowField::new("f3", DataType::Float32, false), + ]), + true, + ), + ArrowField::new("c", DataType::Float64, false), + ]); + assert_eq!(ArrowSchema::from(&projected), expected_arrow_schema); } } From a7963185530d0beb396b3820bfb702c09b1ff627 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:35:17 -0800 Subject: [PATCH 4/7] set field id recursively --- rust/src/datatypes.rs | 30 +++++++++++++++++++++++++++++- rust/src/encodings/plain.rs | 1 - 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index bb403948ea..352274634a 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -652,9 +652,12 @@ impl From<&Schema> for Vec { #[cfg(test)] mod tests { + use super::*; + + use std::collections::BTreeSet; + use arrow_schema::{Field as ArrowField, TimeUnit}; - use super::*; #[test] fn arrow_field_to_field() { @@ -816,4 +819,29 @@ mod tests { ]); assert_eq!(ArrowSchema::from(&projected), expected_arrow_schema); } + + #[test] + fn test_schema_set_ids() { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("a", DataType::Int32, false), + ArrowField::new( + "b", + DataType::Struct(vec![ + ArrowField::new("f1", DataType::Utf8, true), + ArrowField::new("f2", DataType::Boolean, false), + ArrowField::new("f3", DataType::Float32, false), + ]), + true, + ), + ArrowField::new("c", DataType::Float64, false), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + let protos: Vec = (&schema).into(); + let all_ids: BTreeSet<_> = protos.iter().map(|p| p.id).collect(); + assert_eq!(all_ids.len(), 6); + assert_eq!(*all_ids.first().unwrap(), 0); + assert_eq!(*all_ids.last().unwrap(), 5); + } + } diff --git a/rust/src/encodings/plain.rs b/rust/src/encodings/plain.rs index 1bc65caa74..00a0cc3707 100644 --- a/rust/src/encodings/plain.rs +++ b/rust/src/encodings/plain.rs @@ -224,7 +224,6 @@ impl<'a> Decoder for PlainDecoder<'a> { mod tests { use crate::io::ObjectStore; use arrow_array::*; - use arrow_schema::DataType::FixedSizeList; use arrow_schema::Field; use object_store::path::Path; use rand::prelude::*; From 1a8822a4fa63d0ab0b8bc5ddc978175c98632ecc Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:38:14 -0800 Subject: [PATCH 5/7] fix cargo fmt --- rust/src/datatypes.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index 352274634a..d9067d3adb 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -577,7 +577,6 @@ impl Schema { .iter_mut() .for_each(|f| f.set_id(-1, &mut current_id)); } - } impl fmt::Display for Schema { @@ -658,7 +657,6 @@ mod tests { use arrow_schema::{Field as ArrowField, TimeUnit}; - #[test] fn arrow_field_to_field() { for (name, data_type) in [ @@ -843,5 +841,4 @@ mod tests { assert_eq!(*all_ids.first().unwrap(), 0); assert_eq!(*all_ids.last().unwrap(), 5); } - } From 0ea5195aa7900dc840ad0ddd7a6228e7efd68300 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:46:40 -0800 Subject: [PATCH 6/7] fix test on 1.65 --- rust/src/datatypes.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index d9067d3adb..544b81865f 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -836,9 +836,9 @@ mod tests { let schema = Schema::try_from(&arrow_schema).unwrap(); let protos: Vec = (&schema).into(); - let all_ids: BTreeSet<_> = protos.iter().map(|p| p.id).collect(); - assert_eq!(all_ids.len(), 6); - assert_eq!(*all_ids.first().unwrap(), 0); - assert_eq!(*all_ids.last().unwrap(), 5); + assert_eq!( + protos.iter().map(|p| p.id).collect::>(), + (0..6).collect::>() + ); } } From 8b32c4ac736e3314339f371d1cc89e7df872d969 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 8 Jan 2023 10:50:23 -0800 Subject: [PATCH 7/7] fix warnings --- rust/src/datatypes.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index 544b81865f..440d57476f 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -653,8 +653,6 @@ impl From<&Schema> for Vec { mod tests { use super::*; - use std::collections::BTreeSet; - use arrow_schema::{Field as ArrowField, TimeUnit}; #[test]