From eebb7de012644b9411864294e9f5dcb7e45253ae Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 20 Nov 2023 12:40:37 -0500 Subject: [PATCH 1/3] Fix bug in field level metadata matching code --- datafusion/physical-plan/src/projection.rs | 16 ++--- datafusion/sqllogictest/src/test_context.rs | 44 ++++++++++--- .../sqllogictest/test_files/metadata.slt | 62 +++++++++++++++++++ 3 files changed, 104 insertions(+), 18 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/metadata.slt diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index b8e2d0e425d4..d4b2f204a7c5 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -257,16 +257,12 @@ fn get_field_metadata( e: &Arc, input_schema: &Schema, ) -> Option> { - let name = if let Some(column) = e.as_any().downcast_ref::() { - column.name() - } else { - return None; - }; - - input_schema - .field_with_name(name) - .ok() - .map(|f| f.metadata().clone()) + // Look up field by index in schema (not NAME) + e.as_any() + .downcast_ref::() + .map(|column| column.index()) + .map(|idx| input_schema.field(idx).metadata()) + .cloned() } fn stats_projection( diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index b2314f34f360..f5ab8f71aaaf 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -35,6 +35,7 @@ use datafusion::{ }; use datafusion_common::DataFusionError; use log::info; +use std::collections::HashMap; use std::fs::File; use std::io::Write; use std::path::Path; @@ -57,8 +58,8 @@ impl TestContext { } } - /// Create a SessionContext, configured for the specific test, if - /// possible. + /// Create a SessionContext, configured for the specific sqllogictest + /// test(.slt file) , if possible. /// /// If `None` is returned (e.g. because some needed feature is not /// enabled), the file should be skipped @@ -67,7 +68,7 @@ impl TestContext { // hardcode target partitions so plans are deterministic .with_target_partitions(4); - let test_ctx = TestContext::new(SessionContext::new_with_config(config)); + let mut test_ctx = TestContext::new(SessionContext::new_with_config(config)); let file_name = relative_path.file_name().unwrap().to_str().unwrap(); match file_name { @@ -86,10 +87,8 @@ impl TestContext { "avro.slt" => { #[cfg(feature = "avro")] { - let mut test_ctx = test_ctx; info!("Registering avro tables"); register_avro_tables(&mut test_ctx).await; - return Some(test_ctx); } #[cfg(not(feature = "avro"))] { @@ -99,10 +98,11 @@ impl TestContext { } "joins.slt" => { info!("Registering partition table tables"); - - let mut test_ctx = test_ctx; register_partition_table(&mut test_ctx).await; - return Some(test_ctx); + } + "metadata.slt" => { + info!("Registering metadata table tables"); + register_metadata_tables(test_ctx.session_ctx()).await; } _ => { info!("Using default SessionContext"); @@ -299,3 +299,31 @@ fn table_with_many_types() -> Arc { let provider = MemTable::try_new(Arc::new(schema), vec![vec![batch]]).unwrap(); Arc::new(provider) } + +/// Registers a table_with_metadata that contains both field level and Table level metadata +pub async fn register_metadata_tables(ctx: &SessionContext) { + let id = Field::new("id", DataType::Int32, true).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the id field"), + )])); + let name = Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the name field"), + )])); + + let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the entire schema"), + )])); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _, + Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _, + ], + ) + .unwrap(); + + ctx.register_batch("table_with_metadata", batch).unwrap(); +} diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt new file mode 100644 index 000000000000..3b2b219244f5 --- /dev/null +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Tests for tables that has both metadata on each field as well as metadata on +## the schema itself. +########## + +## Note that table_with_metadata is defined using Rust code +## in the test harness as there is no way to define schema +## with metadata in SQL. + +query IT +select * from table_with_metadata; +---- +1 NULL +NULL bar +3 baz + +query I rowsort +SELECT ( + SELECT id FROM table_with_metadata + ) UNION ( + SELECT id FROM table_with_metadata + ); +---- +1 +3 +NULL + +query I rowsort +SELECT "data"."id" +FROM + ( + (SELECT "id" FROM "table_with_metadata") + UNION + (SELECT "id" FROM "table_with_metadata") + ) as "data", + ( + SELECT "id" FROM "table_with_metadata" + ) as "samples" +WHERE "data"."id" = "samples"."id"; +---- +1 +3 + +statement ok +drop table table_with_metadata; From 9358d329700ffb2a5421b8466e4c73d96ca4243d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 20 Nov 2023 13:25:02 -0500 Subject: [PATCH 2/3] improve comment --- datafusion/physical-plan/src/projection.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index d4b2f204a7c5..bcbc0e4bb235 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -257,7 +257,8 @@ fn get_field_metadata( e: &Arc, input_schema: &Schema, ) -> Option> { - // Look up field by index in schema (not NAME) + // Look up field by index in schema (not NAME as there can be more than one + // column with the same name) e.as_any() .downcast_ref::() .map(|column| column.index()) From f5c557e1f8a34ffc855e80a328318f8029b2bb40 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 20 Nov 2023 14:36:13 -0500 Subject: [PATCH 3/3] single map --- datafusion/physical-plan/src/projection.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index bcbc0e4bb235..dfb860bc8cf3 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -261,8 +261,7 @@ fn get_field_metadata( // column with the same name) e.as_any() .downcast_ref::() - .map(|column| column.index()) - .map(|idx| input_schema.field(idx).metadata()) + .map(|column| input_schema.field(column.index()).metadata()) .cloned() }