Skip to content

Commit

Permalink
fix: enable any character in the metadata string value, by having any…
Browse files Browse the repository at this point in the history
… key parsing be a part of the format.metadata::key
  • Loading branch information
wiedld committed Apr 25, 2024
1 parent 0cf04b5 commit 6d6e5ba
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 36 deletions.
62 changes: 50 additions & 12 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1376,6 +1376,15 @@ pub struct TableParquetOptions {
pub column_specific_options: HashMap<String, ColumnOptions>,
/// Additional file-level metadata to include. Inserted into the key_value_metadata
/// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html).
///
/// Multiple entries are permitted
/// ```sql
/// OPTIONS (
/// 'format.metadata::key1' '',
/// 'format.metadata::key2' 'value',
/// 'format.metadata::key3' 'value has spaces'
/// )
/// ```
pub key_value_metadata: HashMap<String, Option<String>>,
}

Expand All @@ -1387,23 +1396,25 @@ impl ConfigField for TableParquetOptions {
}

fn set(&mut self, key: &str, value: &str) -> Result<()> {
// Determine the key if it's a global or column-specific setting
if key.contains("::") {
self.column_specific_options.set(key, value)
} else if key.eq("metadata") {
for maybe_pair in value.split(' ') {
let (k, v) = match maybe_pair.split(':').collect::<Vec<_>>()[..] {
[k, v] => (k.into(), Some(v.into())),
[k] => (k.into(), None),
// Determine if the key is a global, metadata, or column-specific setting
if key.starts_with("metadata::") {
let k =
match key.split("::").collect::<Vec<_>>()[..] {
[_meta] | [_meta, ""] => return Err(DataFusionError::Configuration(
"Invalid metadata key provided, missing key in metadata::<key>"
.to_string(),
)),
[_meta, k] => k.into(),
_ => {
return Err(DataFusionError::Configuration(format!(
"Invalid metadata provided \"{maybe_pair}\""
)))
"Invalid metadata key provided, found too many '::' in \"{key}\""
)))
}
};
self.key_value_metadata.insert(k, v);
}
self.key_value_metadata.insert(k, Some(value.into()));
Ok(())
} else if key.contains("::") {
self.column_specific_options.set(key, value)
} else {
self.global.set(key, value)
}
Expand Down Expand Up @@ -1794,4 +1805,31 @@ mod tests {
.iter()
.any(|item| item.key == "format.bloom_filter_enabled::col1"))
}

#[cfg(feature = "parquet")]
#[test]
fn parquet_table_options_config_metadata_entry() {
let mut table_config = TableOptions::new();
table_config.set_file_format(FileType::PARQUET);
table_config.set("format.metadata::key1", "").unwrap();
table_config.set("format.metadata::key2", "value2").unwrap();
table_config
.set("format.metadata::key3", "value with spaces ")
.unwrap();

let parsed_metadata = table_config.parquet.key_value_metadata.clone();
assert_eq!(parsed_metadata.get("should not exist1"), None);
assert_eq!(parsed_metadata.get("key1"), Some(&Some("".into())));
assert_eq!(parsed_metadata.get("key2"), Some(&Some("value2".into())));
assert_eq!(
parsed_metadata.get("key3"),
Some(&Some("value with spaces ".into()))
);

// duplicate keys are overwritten
table_config.set("format.metadata::key_dupe", "A").unwrap();
table_config.set("format.metadata::key_dupe", "B").unwrap();
let parsed_metadata = table_config.parquet.key_value_metadata;
assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into())));
}
}
25 changes: 16 additions & 9 deletions datafusion/core/src/datasource/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1858,10 +1858,10 @@ mod tests {
let parquet_sink = Arc::new(ParquetSink::new(
file_sink_config,
TableParquetOptions {
key_value_metadata: std::collections::HashMap::from([(
"my-data".to_string(),
Some("stuff".to_string()),
)]),
key_value_metadata: std::collections::HashMap::from([
("my-data".to_string(), Some("stuff".to_string())),
("my-data-bool-key".to_string(), None),
]),
..Default::default()
},
));
Expand Down Expand Up @@ -1916,11 +1916,18 @@ mod tests {
"output file metadata should contain col b"
);

let key_value_metadata = key_value_metadata.unwrap();
let expected_metadata = vec![KeyValue {
key: "my-data".to_string(),
value: Some("stuff".to_string()),
}];
let mut key_value_metadata = key_value_metadata.unwrap();
key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key));
let expected_metadata = vec![
KeyValue {
key: "my-data".to_string(),
value: Some("stuff".to_string()),
},
KeyValue {
key: "my-data-bool-key".to_string(),
value: None,
},
];
assert_eq!(key_value_metadata, expected_metadata);

Ok(())
Expand Down
34 changes: 19 additions & 15 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -284,65 +284,69 @@ OPTIONS (
'format.max_statistics_size' 123,
'format.bloom_filter_fpp' 0.001,
'format.bloom_filter_ndv' 100,
'format.metadata' 'foo:bar baz'
'format.metadata::key' 'value'
)
----
2

# valid vs invalid metadata

# accepts empty map
# accepts map with a single entry
statement ok
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.metadata' ''
'format.metadata::key' 'value'
)

# accepts map with a single entry
# accepts multiple entries (on different keys)
statement ok
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.metadata' 'key:value'
'format.metadata::key1' '',
'format.metadata::key2' 'value',
'format.metadata::key3' 'value with spaces',
'format.metadata::key4' 'value with special chars :: :'
)

# accepts map with multiple entries
# accepts multiple entries with the same key (will overwrite)
statement ok
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.metadata' 'key1:value1 key2:value2'
'format.metadata::key1' 'value',
'format.metadata::key1' 'value'
)

# accepts entries which are key-only (no value)
statement ok
# errors if key is missing
statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, missing key in metadata::<key>
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.metadata' 'key1 key2:value2 key3'
'format.metadata::' 'value'
)

# errors for invalid key-value pair (extra `:`)
statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata provided "foo:bar:extra"
# errors if key contains internal '::'
statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, found too many '::' in "metadata::key::extra"
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.metadata' 'foo:bar:extra'
'format.metadata::key::extra' 'value'
)

# errors for invalid property (not stating `format.metadata`)
statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata-key" not found on ParquetOptions
statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata" not found on ColumnOptions
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
OPTIONS (
'format.wrong-metadata-key' 'key:value'
'format.wrong-metadata::key' 'value'
)


Expand Down

0 comments on commit 6d6e5ba

Please sign in to comment.