From 6d6e5ba784e82ae5f7a3a5a3a4f18d62557175e2 Mon Sep 17 00:00:00 2001 From: wiedld Date: Thu, 25 Apr 2024 13:27:49 -0700 Subject: [PATCH] fix: enable any character in the metadata string value, by having any key parsing be a part of the format.metadata::key --- datafusion/common/src/config.rs | 62 +++++++++++++++---- .../src/datasource/file_format/parquet.rs | 25 +++++--- datafusion/sqllogictest/test_files/copy.slt | 34 +++++----- 3 files changed, 85 insertions(+), 36 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 68d7d6f0db80e..33c45c492f4bf 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1376,6 +1376,15 @@ pub struct TableParquetOptions { pub column_specific_options: HashMap, /// Additional file-level metadata to include. Inserted into the key_value_metadata /// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html). + /// + /// Multiple entries are permitted + /// ```sql + /// OPTIONS ( + /// 'format.metadata::key1' '', + /// 'format.metadata::key2' 'value', + /// 'format.metadata::key3' 'value has spaces' + /// ) + /// ``` pub key_value_metadata: HashMap>, } @@ -1387,23 +1396,25 @@ impl ConfigField for TableParquetOptions { } fn set(&mut self, key: &str, value: &str) -> Result<()> { - // Determine the key if it's a global or column-specific setting - if key.contains("::") { - self.column_specific_options.set(key, value) - } else if key.eq("metadata") { - for maybe_pair in value.split(' ') { - let (k, v) = match maybe_pair.split(':').collect::>()[..] { - [k, v] => (k.into(), Some(v.into())), - [k] => (k.into(), None), + // Determine if the key is a global, metadata, or column-specific setting + if key.starts_with("metadata::") { + let k = + match key.split("::").collect::>()[..] { + [_meta] | [_meta, ""] => return Err(DataFusionError::Configuration( + "Invalid metadata key provided, missing key in metadata::" + .to_string(), + )), + [_meta, k] => k.into(), _ => { return Err(DataFusionError::Configuration(format!( - "Invalid metadata provided \"{maybe_pair}\"" - ))) + "Invalid metadata key provided, found too many '::' in \"{key}\"" + ))) } }; - self.key_value_metadata.insert(k, v); - } + self.key_value_metadata.insert(k, Some(value.into())); Ok(()) + } else if key.contains("::") { + self.column_specific_options.set(key, value) } else { self.global.set(key, value) } @@ -1794,4 +1805,31 @@ mod tests { .iter() .any(|item| item.key == "format.bloom_filter_enabled::col1")) } + + #[cfg(feature = "parquet")] + #[test] + fn parquet_table_options_config_metadata_entry() { + let mut table_config = TableOptions::new(); + table_config.set_file_format(FileType::PARQUET); + table_config.set("format.metadata::key1", "").unwrap(); + table_config.set("format.metadata::key2", "value2").unwrap(); + table_config + .set("format.metadata::key3", "value with spaces ") + .unwrap(); + + let parsed_metadata = table_config.parquet.key_value_metadata.clone(); + assert_eq!(parsed_metadata.get("should not exist1"), None); + assert_eq!(parsed_metadata.get("key1"), Some(&Some("".into()))); + assert_eq!(parsed_metadata.get("key2"), Some(&Some("value2".into()))); + assert_eq!( + parsed_metadata.get("key3"), + Some(&Some("value with spaces ".into())) + ); + + // duplicate keys are overwritten + table_config.set("format.metadata::key_dupe", "A").unwrap(); + table_config.set("format.metadata::key_dupe", "B").unwrap(); + let parsed_metadata = table_config.parquet.key_value_metadata; + assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into()))); + } } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index e54cfcac7ff6d..7ec7d4540fff0 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -1858,10 +1858,10 @@ mod tests { let parquet_sink = Arc::new(ParquetSink::new( file_sink_config, TableParquetOptions { - key_value_metadata: std::collections::HashMap::from([( - "my-data".to_string(), - Some("stuff".to_string()), - )]), + key_value_metadata: std::collections::HashMap::from([ + ("my-data".to_string(), Some("stuff".to_string())), + ("my-data-bool-key".to_string(), None), + ]), ..Default::default() }, )); @@ -1916,11 +1916,18 @@ mod tests { "output file metadata should contain col b" ); - let key_value_metadata = key_value_metadata.unwrap(); - let expected_metadata = vec![KeyValue { - key: "my-data".to_string(), - value: Some("stuff".to_string()), - }]; + let mut key_value_metadata = key_value_metadata.unwrap(); + key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key)); + let expected_metadata = vec![ + KeyValue { + key: "my-data".to_string(), + value: Some("stuff".to_string()), + }, + KeyValue { + key: "my-data-bool-key".to_string(), + value: None, + }, + ]; assert_eq!(key_value_metadata, expected_metadata); Ok(()) diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index cffbc74a627b2..d695e8514b07a 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -284,65 +284,69 @@ OPTIONS ( 'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, -'format.metadata' 'foo:bar baz' +'format.metadata::key' 'value' ) ---- 2 # valid vs invalid metadata -# accepts empty map +# accepts map with a single entry statement ok COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.metadata' '' + 'format.metadata::key' 'value' ) -# accepts map with a single entry +# accepts multiple entries (on different keys) statement ok COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.metadata' 'key:value' + 'format.metadata::key1' '', + 'format.metadata::key2' 'value', + 'format.metadata::key3' 'value with spaces', + 'format.metadata::key4' 'value with special chars :: :' ) -# accepts map with multiple entries +# accepts multiple entries with the same key (will overwrite) statement ok COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.metadata' 'key1:value1 key2:value2' + 'format.metadata::key1' 'value', + 'format.metadata::key1' 'value' ) -# accepts entries which are key-only (no value) -statement ok +# errors if key is missing +statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, missing key in metadata:: COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.metadata' 'key1 key2:value2 key3' + 'format.metadata::' 'value' ) -# errors for invalid key-value pair (extra `:`) -statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata provided "foo:bar:extra" +# errors if key contains internal '::' +statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, found too many '::' in "metadata::key::extra" COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.metadata' 'foo:bar:extra' + 'format.metadata::key::extra' 'value' ) # errors for invalid property (not stating `format.metadata`) -statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata-key" not found on ParquetOptions +statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata" not found on ColumnOptions COPY source_table TO 'test_files/scratch/copy/table_with_metadata/' STORED AS PARQUET OPTIONS ( -'format.wrong-metadata-key' 'key:value' + 'format.wrong-metadata::key' 'value' )