From 8d6fb7fc44fdfaf5a60c670fbdbd7e1b6030be56 Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 13:58:32 -0700 Subject: [PATCH 1/7] feat(11367): define the default session's writer props, both with and without the parquet feature enabled --- datafusion/common/src/config.rs | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b46b002baac0..c2072438fd90 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -314,6 +314,68 @@ config_namespace! { } } +/// When using the parquet feature, +/// use the same default writer settings as the extern parquet. +#[cfg(feature = "parquet")] +mod parquet_defaults { + #![allow(dead_code)] + use parquet::file::properties as props; + + /// Default value for [`parquet::WriterProperties::data_page_size_limit`] + pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE; + /// Default value for [`parquet::WriterProperties::write_batch_size`] + pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE; + /// Default value for [`parquet::WriterProperties::writer_version`] + pub const DEFAULT_WRITER_VERSION: &str = "1.0"; + /// Default value for [`parquet::WriterProperties::dictionary_enabled`] + pub const DEFAULT_DICTIONARY_ENABLED: Option = + Some(props::DEFAULT_DICTIONARY_ENABLED); + /// Default value for [`parquet::WriterProperties::dictionary_page_size_limit`] + pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = + props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT; + /// Default value for [`parquet::WriterProperties::data_page_row_count_limit`] + pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = + props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; + /// Default value for [`parquet::WriterProperties::statistics_enabled`] + pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); + /// Default value for [`parquet::WriterProperties::max_statistics_size`] + pub const DEFAULT_MAX_STATISTICS_SIZE: Option = + Some(props::DEFAULT_MAX_STATISTICS_SIZE); + /// Default value for [`parquet::WriterProperties::max_row_group_size`] + pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE; + /// Default value for [`parquet::WriterProperties::column_index_truncate_length`] + pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = + props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + /// Default value for [`parquet::BloomFilterProperties::fpp`] + pub const DEFAULT_BLOOM_FILTER_FPP: Option = + Some(props::DEFAULT_BLOOM_FILTER_FPP); + /// Default value for [`parquet::BloomFilterProperties::ndv`] + pub const DEFAULT_BLOOM_FILTER_NDV: Option = + Some(props::DEFAULT_BLOOM_FILTER_NDV); +} + +/// When note using the parquet feature, provide a manual copy +/// of the extern parquet's settings in order to compile. +/// +/// This is required since the [`ParquetOptions`] are extended with the +/// `config_namespace` macro, which does not handle internal configuration macros. +#[cfg(not(feature = "parquet"))] +mod parquet_defaults { + #![allow(dead_code)] + pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; + pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; + pub const DEFAULT_WRITER_VERSION: &str = "1.0"; + pub const DEFAULT_DICTIONARY_ENABLED: Option = Some(true); + pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; + pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; + pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); + pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(4096); + pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; + pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); + pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(0.05); + pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(1_000_000_u64); +} + config_namespace! { /// Options for reading and writing parquet files /// From 01bc13efb6a0b703d08910b26781805f28d03a1d Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 13:59:36 -0700 Subject: [PATCH 2/7] feat(11367): map the defaults that already align between datafusion and arrow-rs --- datafusion/common/src/config.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index c2072438fd90..41386806b00c 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -419,14 +419,14 @@ config_namespace! { // and map to parquet::file::properties::WriterProperties /// (writing) Sets best effort maximum size of data page in bytes - pub data_pagesize_limit: usize, default = 1024 * 1024 + pub data_pagesize_limit: usize, default = parquet_defaults::DEFAULT_PAGE_SIZE /// (writing) Sets write_batch_size in bytes - pub write_batch_size: usize, default = 1024 + pub write_batch_size: usize, default = parquet_defaults::DEFAULT_WRITE_BATCH_SIZE /// (writing) Sets parquet writer version /// valid values are "1.0" and "2.0" - pub writer_version: String, default = "1.0".into() + pub writer_version: String, default = parquet_defaults::DEFAULT_WRITER_VERSION.to_string() /// (writing) Sets default parquet compression codec. /// Valid values are: uncompressed, snappy, gzip(level), @@ -440,7 +440,7 @@ config_namespace! { pub dictionary_enabled: Option, default = None /// (writing) Sets best effort maximum dictionary page size, in bytes - pub dictionary_page_size_limit: usize, default = 1024 * 1024 + pub dictionary_page_size_limit: usize, default = parquet_defaults::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT /// (writing) Sets if statistics are enabled for any column /// Valid values are: "none", "chunk", and "page" @@ -455,7 +455,7 @@ config_namespace! { /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. - pub max_row_group_size: usize, default = 1024 * 1024 + pub max_row_group_size: usize, default = parquet_defaults::DEFAULT_MAX_ROW_GROUP_SIZE /// (writing) Sets "created by" property pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into() From 6883c2164562dededfbf83a8a38950293178a1f5 Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 15:42:23 -0700 Subject: [PATCH 3/7] feat(11367): move datafusion to use extern parquet defaults for most writer settings --- datafusion/common/src/config.rs | 8 +-- .../common/src/file_options/parquet_writer.rs | 72 ------------------- .../test_files/information_schema.slt | 16 ++--- 3 files changed, 12 insertions(+), 84 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 41386806b00c..78e353730360 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -437,7 +437,7 @@ config_namespace! { /// (writing) Sets if dictionary encoding is enabled. If NULL, uses /// default parquet writer setting - pub dictionary_enabled: Option, default = None + pub dictionary_enabled: Option, default = parquet_defaults::DEFAULT_DICTIONARY_ENABLED /// (writing) Sets best effort maximum dictionary page size, in bytes pub dictionary_page_size_limit: usize, default = parquet_defaults::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT @@ -450,7 +450,7 @@ config_namespace! { /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting - pub max_statistics_size: Option, default = None + pub max_statistics_size: Option, default = parquet_defaults::DEFAULT_MAX_STATISTICS_SIZE /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but @@ -461,10 +461,10 @@ config_namespace! { pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into() /// (writing) Sets column index truncate length - pub column_index_truncate_length: Option, default = None + pub column_index_truncate_length: Option, default = parquet_defaults::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH /// (writing) Sets best effort maximum number of rows in data page - pub data_page_row_count_limit: usize, default = usize::MAX + pub data_page_row_count_limit: usize, default = parquet_defaults::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT /// (writing) Sets default encoding for any column. /// Valid values are: plain, plain_dictionary, rle, diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index abe7db2009a2..e63a7e5ef78d 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -644,56 +644,6 @@ mod tests { "datafusion's default is zstd" ); - // TODO: data_page_row_count_limit defaults do not match - // refer to https://github.com/apache/datafusion/issues/11367 - assert_eq!( - default_writer_props.data_page_row_count_limit(), - 20_000, - "extern parquet's default data_page_row_count_limit is 20_000" - ); - assert_eq!( - from_datafusion_defaults.data_page_row_count_limit(), - usize::MAX, - "datafusion's default is usize::MAX" - ); - - // TODO: column_index_truncate_length do not match - // refer to https://github.com/apache/datafusion/issues/11367 - assert_eq!( - default_writer_props.column_index_truncate_length(), - Some(64), - "extern parquet's default is 64" - ); - assert_eq!( - from_datafusion_defaults.column_index_truncate_length(), - None, - "datafusion's default is None" - ); - - // The next few examples are where datafusion's default is None. - // But once datafusion's TableParquetOptions are converted to a WriterProperties, - // then we get the extern parquet's defaults. - // - // In other words, we do not get indeterminate behavior in the output writer props. - // But this is only because we use the extern parquet's defaults when we leave - // the datafusion setting as None. - - // datafusion's `None` for Option => becomes parquet's true - // TODO: should this be changed? - // refer to https://github.com/apache/datafusion/issues/11367 - assert!( - default_writer_props.dictionary_enabled(&"default".into()), - "extern parquet's default is true" - ); - assert_eq!( - default_table_writer_opts.global.dictionary_enabled, None, - "datafusion's has no default" - ); - assert!( - from_datafusion_defaults.dictionary_enabled(&"default".into()), - "should see the extern parquet's default over-riding datafusion's None", - ); - // datafusion's `None` for Option => becomes parquet's EnabledStatistics::Page // TODO: should this be changed? // refer to https://github.com/apache/datafusion/issues/11367 @@ -712,35 +662,13 @@ mod tests { "should see the extern parquet's default over-riding datafusion's None", ); - // datafusion's `None` for Option => becomes parquet's 4096 - // TODO: should this be changed? - // refer to https://github.com/apache/datafusion/issues/11367 - assert_eq!( - default_writer_props.max_statistics_size(&"default".into()), - 4096, - "extern parquet's default is 4096" - ); - assert_eq!( - default_table_writer_opts.global.max_statistics_size, None, - "datafusion's has no default" - ); - assert_eq!( - default_writer_props.max_statistics_size(&"default".into()), - 4096, - "should see the extern parquet's default over-riding datafusion's None", - ); - // Confirm all other settings are equal. // First resolve the known discrepancies, (set as the same). // TODO: once we fix the above mis-matches, we should be able to remove this. let mut from_extern_parquet = session_config_from_writer_props(&default_writer_props); from_extern_parquet.global.compression = Some("zstd(3)".into()); - from_extern_parquet.global.data_page_row_count_limit = usize::MAX; - from_extern_parquet.global.column_index_truncate_length = None; - from_extern_parquet.global.dictionary_enabled = None; from_extern_parquet.global.statistics_enabled = None; - from_extern_parquet.global.max_statistics_size = None; // Expected: the remaining should match let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f7b755b01911..a310fc84a41e 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -183,17 +183,17 @@ datafusion.execution.parquet.bloom_filter_fpp NULL datafusion.execution.parquet.bloom_filter_ndv NULL datafusion.execution.parquet.bloom_filter_on_read true datafusion.execution.parquet.bloom_filter_on_write false -datafusion.execution.parquet.column_index_truncate_length NULL +datafusion.execution.parquet.column_index_truncate_length 64 datafusion.execution.parquet.compression zstd(3) datafusion.execution.parquet.created_by datafusion -datafusion.execution.parquet.data_page_row_count_limit 18446744073709551615 +datafusion.execution.parquet.data_page_row_count_limit 20000 datafusion.execution.parquet.data_pagesize_limit 1048576 -datafusion.execution.parquet.dictionary_enabled NULL +datafusion.execution.parquet.dictionary_enabled true datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 -datafusion.execution.parquet.max_statistics_size NULL +datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -267,17 +267,17 @@ datafusion.execution.parquet.bloom_filter_fpp NULL (writing) Sets bloom filter f datafusion.execution.parquet.bloom_filter_ndv NULL (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting datafusion.execution.parquet.bloom_filter_on_read true (writing) Use any available bloom filters when reading parquet files datafusion.execution.parquet.bloom_filter_on_write false (writing) Write bloom filters for all columns when creating parquet files -datafusion.execution.parquet.column_index_truncate_length NULL (writing) Sets column index truncate length +datafusion.execution.parquet.column_index_truncate_length 64 (writing) Sets column index truncate length datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.created_by datafusion (writing) Sets "created by" property -datafusion.execution.parquet.data_page_row_count_limit 18446744073709551615 (writing) Sets best effort maximum number of rows in data page +datafusion.execution.parquet.data_page_row_count_limit 20000 (writing) Sets best effort maximum number of rows in data page datafusion.execution.parquet.data_pagesize_limit 1048576 (writing) Sets best effort maximum size of data page in bytes -datafusion.execution.parquet.dictionary_enabled NULL (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting +datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size NULL (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting +datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer From 4899a00e4e0d8e366c7f877c815d96b88a98f26c Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 15:54:44 -0700 Subject: [PATCH 4/7] chore(11367): denote what are the remaining misaligned defaults, for further discussion --- datafusion/common/src/config.rs | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 78e353730360..50e7c59a517e 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -318,7 +318,7 @@ config_namespace! { /// use the same default writer settings as the extern parquet. #[cfg(feature = "parquet")] mod parquet_defaults { - #![allow(dead_code)] + use parquet::basic::Compression; use parquet::file::properties as props; /// Default value for [`parquet::WriterProperties::data_page_size_limit`] @@ -336,8 +336,6 @@ mod parquet_defaults { /// Default value for [`parquet::WriterProperties::data_page_row_count_limit`] pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; - /// Default value for [`parquet::WriterProperties::statistics_enabled`] - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); /// Default value for [`parquet::WriterProperties::max_statistics_size`] pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(props::DEFAULT_MAX_STATISTICS_SIZE); @@ -346,12 +344,25 @@ mod parquet_defaults { /// Default value for [`parquet::WriterProperties::column_index_truncate_length`] pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + + // TODO: discuss if we want datafusion to use these defaults from the extern parquet + // refer to https://github.com/apache/datafusion/issues/11367 + + #[allow(dead_code)] + /// Default value for [`parquet::WriterProperties::statistics_enabled`] + pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); + #[allow(dead_code)] /// Default value for [`parquet::BloomFilterProperties::fpp`] pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(props::DEFAULT_BLOOM_FILTER_FPP); + #[allow(dead_code)] /// Default value for [`parquet::BloomFilterProperties::ndv`] pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(props::DEFAULT_BLOOM_FILTER_NDV); + + #[allow(dead_code)] + /// Default value for [`parquet::WriterProperties::compression`] + pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; } /// When note using the parquet feature, provide a manual copy @@ -361,18 +372,24 @@ mod parquet_defaults { /// `config_namespace` macro, which does not handle internal configuration macros. #[cfg(not(feature = "parquet"))] mod parquet_defaults { - #![allow(dead_code)] pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; pub const DEFAULT_WRITER_VERSION: &str = "1.0"; pub const DEFAULT_DICTIONARY_ENABLED: Option = Some(true); pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(4096); pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); + + // TODO: discuss if we want datafusion to use these defaults from the extern parquet + // refer to https://github.com/apache/datafusion/issues/11367 + + #[allow(dead_code)] + pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); + #[allow(dead_code)] pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(0.05); + #[allow(dead_code)] pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(1_000_000_u64); } From ac9969bd05285ba678de1fcb472a4b14231c0eaf Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 16:29:12 -0700 Subject: [PATCH 5/7] chore: fix doc links --- datafusion/common/src/config.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 50e7c59a517e..156eac60b866 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -321,27 +321,27 @@ mod parquet_defaults { use parquet::basic::Compression; use parquet::file::properties as props; - /// Default value for [`parquet::WriterProperties::data_page_size_limit`] + /// Default value for [`props::WriterProperties::data_page_size_limit`] pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE; - /// Default value for [`parquet::WriterProperties::write_batch_size`] + /// Default value for [`props::WriterProperties::write_batch_size`] pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE; - /// Default value for [`parquet::WriterProperties::writer_version`] + /// Default value for [`props::WriterProperties::writer_version`] pub const DEFAULT_WRITER_VERSION: &str = "1.0"; - /// Default value for [`parquet::WriterProperties::dictionary_enabled`] + /// Default value for [`props::WriterProperties::dictionary_enabled`] pub const DEFAULT_DICTIONARY_ENABLED: Option = Some(props::DEFAULT_DICTIONARY_ENABLED); - /// Default value for [`parquet::WriterProperties::dictionary_page_size_limit`] + /// Default value for [`props::WriterProperties::dictionary_page_size_limit`] pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT; - /// Default value for [`parquet::WriterProperties::data_page_row_count_limit`] + /// Default value for [`props::WriterProperties::data_page_row_count_limit`] pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; - /// Default value for [`parquet::WriterProperties::max_statistics_size`] + /// Default value for [`props::WriterProperties::max_statistics_size`] pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(props::DEFAULT_MAX_STATISTICS_SIZE); - /// Default value for [`parquet::WriterProperties::max_row_group_size`] + /// Default value for [`props::WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE; - /// Default value for [`parquet::WriterProperties::column_index_truncate_length`] + /// Default value for [`props::WriterProperties::column_index_truncate_length`] pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; @@ -349,19 +349,19 @@ mod parquet_defaults { // refer to https://github.com/apache/datafusion/issues/11367 #[allow(dead_code)] - /// Default value for [`parquet::WriterProperties::statistics_enabled`] + /// Default value for [`props::WriterProperties::statistics_enabled`] pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); #[allow(dead_code)] - /// Default value for [`parquet::BloomFilterProperties::fpp`] + /// Default value for [`props::BloomFilterProperties::fpp`] pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(props::DEFAULT_BLOOM_FILTER_FPP); #[allow(dead_code)] - /// Default value for [`parquet::BloomFilterProperties::ndv`] + /// Default value for [`props::BloomFilterProperties::ndv`] pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(props::DEFAULT_BLOOM_FILTER_NDV); #[allow(dead_code)] - /// Default value for [`parquet::WriterProperties::compression`] + /// Default value for [props::WriterProperties::compression`] pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; } From 38124ba82fa614368fc868325c619e26b007bd11 Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 19 Jul 2024 16:32:34 -0700 Subject: [PATCH 6/7] chore: update the readme demonstrating the default config setting --- docs/source/user-guide/configs.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 8d3ecbc98544..4e9d95228743 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -59,14 +59,14 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | | datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_enabled | NULL | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_statistics_size | NULL | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 40.0.0 | (writing) Sets "created by" property | -| datafusion.execution.parquet.column_index_truncate_length | NULL | (writing) Sets column index truncate length | -| datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | (writing) Sets best effort maximum number of rows in data page | +| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | +| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.bloom_filter_on_read | true | (writing) Use any available bloom filters when reading parquet files | | datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | From d9724164e30379f721feb192a3893ef45f4c8a97 Mon Sep 17 00:00:00 2001 From: wiedld Date: Sun, 21 Jul 2024 19:22:20 -0700 Subject: [PATCH 7/7] refactor(11367): remove layer of indirection using the conditional parquet feature, instead rely upon regression testing --- datafusion/common/src/config.rs | 100 +++--------------- .../test_files/information_schema.slt | 2 +- docs/source/user-guide/configs.md | 2 +- 3 files changed, 14 insertions(+), 90 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 156eac60b866..58dc1f9adde4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -314,85 +314,6 @@ config_namespace! { } } -/// When using the parquet feature, -/// use the same default writer settings as the extern parquet. -#[cfg(feature = "parquet")] -mod parquet_defaults { - use parquet::basic::Compression; - use parquet::file::properties as props; - - /// Default value for [`props::WriterProperties::data_page_size_limit`] - pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE; - /// Default value for [`props::WriterProperties::write_batch_size`] - pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE; - /// Default value for [`props::WriterProperties::writer_version`] - pub const DEFAULT_WRITER_VERSION: &str = "1.0"; - /// Default value for [`props::WriterProperties::dictionary_enabled`] - pub const DEFAULT_DICTIONARY_ENABLED: Option = - Some(props::DEFAULT_DICTIONARY_ENABLED); - /// Default value for [`props::WriterProperties::dictionary_page_size_limit`] - pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = - props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT; - /// Default value for [`props::WriterProperties::data_page_row_count_limit`] - pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = - props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; - /// Default value for [`props::WriterProperties::max_statistics_size`] - pub const DEFAULT_MAX_STATISTICS_SIZE: Option = - Some(props::DEFAULT_MAX_STATISTICS_SIZE); - /// Default value for [`props::WriterProperties::max_row_group_size`] - pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE; - /// Default value for [`props::WriterProperties::column_index_truncate_length`] - pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = - props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; - - // TODO: discuss if we want datafusion to use these defaults from the extern parquet - // refer to https://github.com/apache/datafusion/issues/11367 - - #[allow(dead_code)] - /// Default value for [`props::WriterProperties::statistics_enabled`] - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); - #[allow(dead_code)] - /// Default value for [`props::BloomFilterProperties::fpp`] - pub const DEFAULT_BLOOM_FILTER_FPP: Option = - Some(props::DEFAULT_BLOOM_FILTER_FPP); - #[allow(dead_code)] - /// Default value for [`props::BloomFilterProperties::ndv`] - pub const DEFAULT_BLOOM_FILTER_NDV: Option = - Some(props::DEFAULT_BLOOM_FILTER_NDV); - - #[allow(dead_code)] - /// Default value for [props::WriterProperties::compression`] - pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; -} - -/// When note using the parquet feature, provide a manual copy -/// of the extern parquet's settings in order to compile. -/// -/// This is required since the [`ParquetOptions`] are extended with the -/// `config_namespace` macro, which does not handle internal configuration macros. -#[cfg(not(feature = "parquet"))] -mod parquet_defaults { - pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; - pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; - pub const DEFAULT_WRITER_VERSION: &str = "1.0"; - pub const DEFAULT_DICTIONARY_ENABLED: Option = Some(true); - pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; - pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; - pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(4096); - pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; - pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); - - // TODO: discuss if we want datafusion to use these defaults from the extern parquet - // refer to https://github.com/apache/datafusion/issues/11367 - - #[allow(dead_code)] - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); - #[allow(dead_code)] - pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(0.05); - #[allow(dead_code)] - pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(1_000_000_u64); -} - config_namespace! { /// Options for reading and writing parquet files /// @@ -436,28 +357,31 @@ config_namespace! { // and map to parquet::file::properties::WriterProperties /// (writing) Sets best effort maximum size of data page in bytes - pub data_pagesize_limit: usize, default = parquet_defaults::DEFAULT_PAGE_SIZE + pub data_pagesize_limit: usize, default = 1024 * 1024 /// (writing) Sets write_batch_size in bytes - pub write_batch_size: usize, default = parquet_defaults::DEFAULT_WRITE_BATCH_SIZE + pub write_batch_size: usize, default = 1024 /// (writing) Sets parquet writer version /// valid values are "1.0" and "2.0" - pub writer_version: String, default = parquet_defaults::DEFAULT_WRITER_VERSION.to_string() + pub writer_version: String, default = "1.0".to_string() /// (writing) Sets default parquet compression codec. /// Valid values are: uncompressed, snappy, gzip(level), /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. /// These values are not case sensitive. If NULL, uses /// default parquet writer setting + /// + /// Note that this default setting is not the same as + /// the default parquet writer setting. pub compression: Option, default = Some("zstd(3)".into()) /// (writing) Sets if dictionary encoding is enabled. If NULL, uses /// default parquet writer setting - pub dictionary_enabled: Option, default = parquet_defaults::DEFAULT_DICTIONARY_ENABLED + pub dictionary_enabled: Option, default = Some(true) /// (writing) Sets best effort maximum dictionary page size, in bytes - pub dictionary_page_size_limit: usize, default = parquet_defaults::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT + pub dictionary_page_size_limit: usize, default = 1024 * 1024 /// (writing) Sets if statistics are enabled for any column /// Valid values are: "none", "chunk", and "page" @@ -467,21 +391,21 @@ config_namespace! { /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting - pub max_statistics_size: Option, default = parquet_defaults::DEFAULT_MAX_STATISTICS_SIZE + pub max_statistics_size: Option, default = Some(4096) /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. - pub max_row_group_size: usize, default = parquet_defaults::DEFAULT_MAX_ROW_GROUP_SIZE + pub max_row_group_size: usize, default = 1024 * 1024 /// (writing) Sets "created by" property pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into() /// (writing) Sets column index truncate length - pub column_index_truncate_length: Option, default = parquet_defaults::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH + pub column_index_truncate_length: Option, default = Some(64) /// (writing) Sets best effort maximum number of rows in data page - pub data_page_row_count_limit: usize, default = parquet_defaults::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT + pub data_page_row_count_limit: usize, default = 20_000 /// (writing) Sets default encoding for any column. /// Valid values are: plain, plain_dictionary, rle, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index a310fc84a41e..d5cad0d1f734 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -268,7 +268,7 @@ datafusion.execution.parquet.bloom_filter_ndv NULL (writing) Sets bloom filter n datafusion.execution.parquet.bloom_filter_on_read true (writing) Use any available bloom filters when reading parquet files datafusion.execution.parquet.bloom_filter_on_write false (writing) Write bloom filters for all columns when creating parquet files datafusion.execution.parquet.column_index_truncate_length 64 (writing) Sets column index truncate length -datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting +datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. datafusion.execution.parquet.created_by datafusion (writing) Sets "created by" property datafusion.execution.parquet.data_page_row_count_limit 20000 (writing) Sets best effort maximum number of rows in data page datafusion.execution.parquet.data_pagesize_limit 1048576 (writing) Sets best effort maximum size of data page in bytes diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 4e9d95228743..f4e4eb1a7193 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -58,7 +58,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |