Skip to content

Commit

Permalink
feat: Persist schema metadata (#1133)
Browse files Browse the repository at this point in the history
  • Loading branch information
gsilvestrin authored Aug 11, 2023
1 parent c23b3e0 commit d3e514e
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 3 deletions.
7 changes: 7 additions & 0 deletions python/python/tests/test_lance.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,10 @@ def test_roundtrip_types(tmp_path, sample_data_all_types):
dataset = lance.write_dataset(sample_data_all_types, tmp_path)
roundtripped = dataset.to_table()
assert roundtripped == sample_data_all_types


def test_roundtrip_schema(tmp_path):
schema = pa.schema([pa.field("a", pa.float64())], metadata={"key": "value"})
data = pa.table({"a": [1.0, 2.0]}).to_batches()
dataset = lance.write_dataset(data, tmp_path, schema=schema)
assert dataset.schema == schema
51 changes: 51 additions & 0 deletions rust/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,25 @@ impl From<&Vec<pb::Field>> for Schema {
}
}

/// Convert list of protobuf `Field` and Metadata to a Schema.
impl From<(&Vec<pb::Field>, HashMap<String, Vec<u8>>)> for Schema {
fn from((fields, metadata): (&Vec<pb::Field>, HashMap<String, Vec<u8>>)) -> Self {
let lance_metadata = metadata
.into_iter()
.map(|(key, value)| {
let string_value = String::from_utf8_lossy(&value).to_string();
(key, string_value)
})
.collect();

let schema_with_fields = Self::from(fields);
Self {
fields: schema_with_fields.fields,
metadata: lance_metadata,
}
}
}

/// Convert a Schema to a list of protobuf Field.
impl From<&Schema> for Vec<pb::Field> {
fn from(schema: &Schema) -> Self {
Expand All @@ -351,6 +370,20 @@ impl From<&Schema> for Vec<pb::Field> {
}
}

/// Convert a Schema to a list of protobuf Field and Metadata
impl From<&Schema> for (Vec<pb::Field>, HashMap<String, Vec<u8>>) {
fn from(schema: &Schema) -> Self {
let fields: Vec<pb::Field> = schema.into();
let pb_metadata = schema
.metadata
.clone()
.into_iter()
.map(|(key, value)| (key, value.into_bytes()))
.collect();
(fields, pb_metadata)
}
}

#[cfg(test)]
mod tests {
use std::sync::Arc;
Expand Down Expand Up @@ -514,6 +547,24 @@ mod tests {
);
}

#[test]
fn test_schema_metadata() {
let mut metadata: HashMap<String, String> = HashMap::new();
metadata.insert(String::from("k1"), String::from("v1"));
metadata.insert(String::from("k2"), String::from("v2"));

let arrow_schema = ArrowSchema::new_with_metadata(
vec![ArrowField::new("a", DataType::Int32, false)],
metadata,
);

let expected_schema = Schema::try_from(&arrow_schema).unwrap();
let (fields, meta): (Vec<pb::Field>, HashMap<String, Vec<u8>>) = (&expected_schema).into();

let schema = Schema::from((&fields, meta));
assert_eq!(expected_schema, schema);
}

#[test]
fn test_get_nested_field() {
let arrow_schema = ArrowSchema::new(vec![ArrowField::new(
Expand Down
7 changes: 4 additions & 3 deletions rust/src/format/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ impl From<pb::Manifest> for Manifest {
sec + nanos
});
Self {
schema: Schema::from(&p.fields),
schema: Schema::from((&p.fields, p.metadata)),
version: p.version,
fragments: Arc::new(p.fragments.iter().map(Fragment::from).collect()),
version_aux_data: p.version_aux_data as usize,
Expand All @@ -210,11 +210,12 @@ impl From<&Manifest> for pb::Manifest {
nanos: nanos as i32,
})
};
let (fields, metadata): (Vec<pb::Field>, HashMap<String, Vec<u8>>) = (&m.schema).into();
Self {
fields: (&m.schema).into(),
fields,
version: m.version,
fragments: m.fragments.iter().map(pb::DataFragment::from).collect(),
metadata: HashMap::default(),
metadata,
version_aux_data: m.version_aux_data as u64,
index_section: m.index_section.map(|i| i as u64),
timestamp: timestamp_nanos,
Expand Down

0 comments on commit d3e514e

Please sign in to comment.