From 41642d861b7358be93c3a682357537837390eb22 Mon Sep 17 00:00:00 2001 From: gsilvestrin Date: Thu, 10 Aug 2023 17:15:16 -0700 Subject: [PATCH 1/3] feat: Persist schema metadata Close #1128 --- python/python/tests/test_lance.py | 9 ++++++ rust/src/datatypes/schema.rs | 51 +++++++++++++++++++++++++++++++ rust/src/format/manifest.rs | 7 +++-- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/python/python/tests/test_lance.py b/python/python/tests/test_lance.py index 8c4836f9ae..ab5edb4a08 100644 --- a/python/python/tests/test_lance.py +++ b/python/python/tests/test_lance.py @@ -215,3 +215,12 @@ def test_roundtrip_types(tmp_path, sample_data_all_types): dataset = lance.write_dataset(sample_data_all_types, tmp_path) roundtripped = dataset.to_table() assert roundtripped == sample_data_all_types + + +def test_roundtrip_schema(tmp_path): + schema = pa.schema([pa.field("a", pa.float64())], metadata={"key": "value"}) + data = pa.table( + {"a": [1.0, 2.0]} + ).to_batches() + dataset = lance.write_dataset(data, tmp_path, schema=schema) + assert dataset.schema == schema diff --git a/rust/src/datatypes/schema.rs b/rust/src/datatypes/schema.rs index d1524fb890..5a9b6535bb 100644 --- a/rust/src/datatypes/schema.rs +++ b/rust/src/datatypes/schema.rs @@ -340,6 +340,25 @@ impl From<&Vec> for Schema { } } +/// Convert list of protobuf `Field` and Metadata to a Schema. +impl From<(&Vec, HashMap>)> for Schema { + fn from((fields, metadata): (&Vec, HashMap>)) -> Self { + let lance_metadata = metadata + .into_iter() + .map(|(key, value)| { + let string_value = String::from_utf8_lossy(&value).to_string(); + (key, string_value) + }) + .collect(); + + let schema_with_fields = Schema::from(fields); + Self { + fields: schema_with_fields.fields, + metadata: lance_metadata, + } + } +} + /// Convert a Schema to a list of protobuf Field. impl From<&Schema> for Vec { fn from(schema: &Schema) -> Self { @@ -351,6 +370,20 @@ impl From<&Schema> for Vec { } } +/// Convert a Schema to a list of protobuf Field and Metadata +impl From<&Schema> for (Vec, HashMap>) { + fn from(schema: &Schema) -> Self { + let fields: Vec = schema.into(); + let pb_metadata = schema + .metadata + .clone() + .into_iter() + .map(|(key, value)| (key, value.into_bytes())) + .collect(); + (fields, pb_metadata) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -514,6 +547,24 @@ mod tests { ); } + #[test] + fn test_schema_metadata() { + let mut metadata: HashMap = HashMap::new(); + metadata.insert(String::from("k1"), String::from("v1")); + metadata.insert(String::from("k2"), String::from("v2")); + + let arrow_schema = ArrowSchema::new_with_metadata( + vec![ArrowField::new("a", DataType::Int32, false)], + metadata, + ); + + let expected_schema = Schema::try_from(&arrow_schema).unwrap(); + let (fields, meta): (Vec, HashMap>) = (&expected_schema).into(); + + let schema = Schema::from((&fields, meta)); + assert_eq!(expected_schema, schema); + } + #[test] fn test_get_nested_field() { let arrow_schema = ArrowSchema::new(vec![ArrowField::new( diff --git a/rust/src/format/manifest.rs b/rust/src/format/manifest.rs index 476fa3c7bd..f1d8d8c899 100644 --- a/rust/src/format/manifest.rs +++ b/rust/src/format/manifest.rs @@ -184,7 +184,7 @@ impl From for Manifest { sec + nanos }); Self { - schema: Schema::from(&p.fields), + schema: Schema::from((&p.fields, p.metadata)), version: p.version, fragments: Arc::new(p.fragments.iter().map(Fragment::from).collect()), version_aux_data: p.version_aux_data as usize, @@ -210,11 +210,12 @@ impl From<&Manifest> for pb::Manifest { nanos: nanos as i32, }) }; + let (fields, metadata): (Vec, HashMap>) = (&m.schema).into(); Self { - fields: (&m.schema).into(), + fields: fields, version: m.version, fragments: m.fragments.iter().map(pb::DataFragment::from).collect(), - metadata: HashMap::default(), + metadata: metadata, version_aux_data: m.version_aux_data as u64, index_section: m.index_section.map(|i| i as u64), timestamp: timestamp_nanos, From ad877c7779ca84df289368088b5e79b8a07abe7a Mon Sep 17 00:00:00 2001 From: gsilvestrin Date: Thu, 10 Aug 2023 17:20:02 -0700 Subject: [PATCH 2/3] clippy --- rust/src/datatypes/schema.rs | 2 +- rust/src/format/manifest.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/src/datatypes/schema.rs b/rust/src/datatypes/schema.rs index 5a9b6535bb..128ff2bdff 100644 --- a/rust/src/datatypes/schema.rs +++ b/rust/src/datatypes/schema.rs @@ -351,7 +351,7 @@ impl From<(&Vec, HashMap>)> for Schema { }) .collect(); - let schema_with_fields = Schema::from(fields); + let schema_with_fields = Self::from(fields); Self { fields: schema_with_fields.fields, metadata: lance_metadata, diff --git a/rust/src/format/manifest.rs b/rust/src/format/manifest.rs index f1d8d8c899..4d99b3a70f 100644 --- a/rust/src/format/manifest.rs +++ b/rust/src/format/manifest.rs @@ -212,10 +212,10 @@ impl From<&Manifest> for pb::Manifest { }; let (fields, metadata): (Vec, HashMap>) = (&m.schema).into(); Self { - fields: fields, + fields, version: m.version, fragments: m.fragments.iter().map(pb::DataFragment::from).collect(), - metadata: metadata, + metadata, version_aux_data: m.version_aux_data as u64, index_section: m.index_section.map(|i| i as u64), timestamp: timestamp_nanos, From 42ccf014c5864203877ffd079788e1197dc32e49 Mon Sep 17 00:00:00 2001 From: gsilvestrin Date: Thu, 10 Aug 2023 17:33:26 -0700 Subject: [PATCH 3/3] black --- python/python/tests/test_lance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/python/tests/test_lance.py b/python/python/tests/test_lance.py index ab5edb4a08..5574eae5c1 100644 --- a/python/python/tests/test_lance.py +++ b/python/python/tests/test_lance.py @@ -219,8 +219,6 @@ def test_roundtrip_types(tmp_path, sample_data_all_types): def test_roundtrip_schema(tmp_path): schema = pa.schema([pa.field("a", pa.float64())], metadata={"key": "value"}) - data = pa.table( - {"a": [1.0, 2.0]} - ).to_batches() + data = pa.table({"a": [1.0, 2.0]}).to_batches() dataset = lance.write_dataset(data, tmp_path, schema=schema) assert dataset.schema == schema