From 8348bf2eb8cd99a0dc9999f21de8db068398c4e9 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Mon, 8 Jul 2024 12:24:01 +0200 Subject: [PATCH 1/2] Use the storage location name as the key in the clade schema --- clade/proto/schema.proto | 12 +++++++----- src/catalog/metastore.rs | 16 ++++++++-------- src/catalog/repository.rs | 2 +- tests/fixtures.rs | 8 +++++--- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/clade/proto/schema.proto b/clade/proto/schema.proto index d58cbd09..3abad5ca 100644 --- a/clade/proto/schema.proto +++ b/clade/proto/schema.proto @@ -11,16 +11,18 @@ message TableObject { string name = 1; // Path within the provided storage location, if any string path = 2; - // URL of the root storage location - optional string location = 3; + // Storage location identifier + optional string store = 3; } // A single root storage location, hosting many individual tables message StorageLocation { - // URL of the root storage location - string location = 1; + // Unique storage location identifier + string name = 1; + // URL of the storage location root + string location = 2; // Connection options for the object store client - map options = 2; + map options = 3; } message ListSchemaRequest { diff --git a/src/catalog/metastore.rs b/src/catalog/metastore.rs index 622d4946..834c8d91 100644 --- a/src/catalog/metastore.rs +++ b/src/catalog/metastore.rs @@ -97,7 +97,7 @@ impl Metastore { let store_options = catalog_schemas .stores .into_iter() - .map(|store| (store.location, store.options)) + .map(|store| (store.name, (store.location, store.options))) .collect(); // Turn the list of all collections, tables and their columns into a nested map. @@ -119,7 +119,7 @@ impl Metastore { async fn build_schema( &self, schema: SchemaObject, - store_options: &HashMap>, + store_options: &HashMap)>, ) -> CatalogResult<(Arc, Arc)> { let schema_name = schema.name; @@ -140,7 +140,7 @@ impl Metastore { async fn build_table( &self, table: TableObject, - store_options: &HashMap>, + store_options: &HashMap)>, ) -> CatalogResult<(Arc, Arc)> { // Build a delta table but don't load it yet; we'll do that only for tables that are // actually referenced in a statement, via the async `table` method of the schema provider. @@ -148,13 +148,13 @@ impl Metastore { // delta tables present in the database. The real fix for this is to make DF use `TableSource` // for the information schema, and then implement `TableSource` for `DeltaTable` in delta-rs. - let table_log_store = match table.location { + let table_log_store = match table.store { // Use the provided customized location - Some(location) => { - let this_store_options = store_options - .get(&location) + Some(name) => { + let (location, this_store_options) = store_options + .get(&name) .ok_or(CatalogError::Generic { - reason: format!("Object store for location {location} not found"), + reason: format!("Object store with name {name} not found"), })? .clone(); diff --git a/src/catalog/repository.rs b/src/catalog/repository.rs index 07595429..278c8d07 100644 --- a/src/catalog/repository.rs +++ b/src/catalog/repository.rs @@ -130,7 +130,7 @@ impl SchemaStore for RepositoryStore { Some(TableObject { name: name.clone(), path: uuid.to_string(), - location: None, + store: None, }) } else { None diff --git a/tests/fixtures.rs b/tests/fixtures.rs index f50c71c4..46832963 100644 --- a/tests/fixtures.rs +++ b/tests/fixtures.rs @@ -29,7 +29,7 @@ pub fn schemas() -> ListSchemaResponse { tables: vec![TableObject { name: "file".to_string(), path: "delta-0.8.0-partitioned".to_string(), - location: None, + store: None, }], }, SchemaObject { @@ -37,7 +37,7 @@ pub fn schemas() -> ListSchemaResponse { tables: vec![TableObject { name: "minio".to_string(), path: "test-data/delta-0.8.0-partitioned".to_string(), - location: Some("s3://seafowl-test-bucket".to_string()), + store: Some("minio".to_string()), }], }, SchemaObject { @@ -45,12 +45,13 @@ pub fn schemas() -> ListSchemaResponse { tables: vec![TableObject { name: "fake".to_string(), path: "delta-0.8.0-partitioned".to_string(), - location: Some("gs://test-data".to_string()), + store: Some("fake-gcs".to_string()), }], }, ], stores: vec![ StorageLocation { + name: "minio".to_string(), location: "s3://seafowl-test-bucket".to_string(), options: HashMap::from([ ( @@ -74,6 +75,7 @@ pub fn schemas() -> ListSchemaResponse { ]), }, StorageLocation { + name: "fake-gcs".to_string(), location: "gs://test-data".to_string(), options: HashMap::from([( GoogleConfigKey::ServiceAccount.as_ref().to_string(), From 7cac2ce7b8948767156d6d0f2fe95b5cf7d54ac5 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Wed, 10 Jul 2024 06:54:28 +0200 Subject: [PATCH 2/2] Make clade schema field numbers backward compatible --- clade/proto/schema.proto | 10 +++++----- src/catalog/metastore.rs | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/clade/proto/schema.proto b/clade/proto/schema.proto index 3abad5ca..5d726731 100644 --- a/clade/proto/schema.proto +++ b/clade/proto/schema.proto @@ -12,17 +12,17 @@ message TableObject { // Path within the provided storage location, if any string path = 2; // Storage location identifier - optional string store = 3; + optional string store = 4; } // A single root storage location, hosting many individual tables message StorageLocation { - // Unique storage location identifier - string name = 1; // URL of the storage location root - string location = 2; + string location = 1; // Connection options for the object store client - map options = 3; + map options = 2; + // Unique storage location identifier + string name = 3; } message ListSchemaRequest { diff --git a/src/catalog/metastore.rs b/src/catalog/metastore.rs index 834c8d91..98786df4 100644 --- a/src/catalog/metastore.rs +++ b/src/catalog/metastore.rs @@ -26,6 +26,9 @@ use std::str::FromStr; use std::sync::Arc; use url::Url; +// Root URL for a storage location alongside client connection options +type LocationAndOptions = (String, HashMap); + // This is the main entrypoint to all individual catalogs for various objects types. // The intention is to make it extensible and de-coupled from the underlying metastore // persistence mechanism (such as the presently used `Repository`). @@ -119,7 +122,7 @@ impl Metastore { async fn build_schema( &self, schema: SchemaObject, - store_options: &HashMap)>, + store_options: &HashMap, ) -> CatalogResult<(Arc, Arc)> { let schema_name = schema.name; @@ -140,7 +143,7 @@ impl Metastore { async fn build_table( &self, table: TableObject, - store_options: &HashMap)>, + store_options: &HashMap, ) -> CatalogResult<(Arc, Arc)> { // Build a delta table but don't load it yet; we'll do that only for tables that are // actually referenced in a statement, via the async `table` method of the schema provider.