diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 156eac60b8664..b899d6dd9b056 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -314,85 +314,6 @@ config_namespace! { } } -/// When using the parquet feature, -/// use the same default writer settings as the extern parquet. -#[cfg(feature = "parquet")] -mod parquet_defaults { - use parquet::basic::Compression; - use parquet::file::properties as props; - - /// Default value for [`props::WriterProperties::data_page_size_limit`] - pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE; - /// Default value for [`props::WriterProperties::write_batch_size`] - pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE; - /// Default value for [`props::WriterProperties::writer_version`] - pub const DEFAULT_WRITER_VERSION: &str = "1.0"; - /// Default value for [`props::WriterProperties::dictionary_enabled`] - pub const DEFAULT_DICTIONARY_ENABLED: Option = - Some(props::DEFAULT_DICTIONARY_ENABLED); - /// Default value for [`props::WriterProperties::dictionary_page_size_limit`] - pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = - props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT; - /// Default value for [`props::WriterProperties::data_page_row_count_limit`] - pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = - props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; - /// Default value for [`props::WriterProperties::max_statistics_size`] - pub const DEFAULT_MAX_STATISTICS_SIZE: Option = - Some(props::DEFAULT_MAX_STATISTICS_SIZE); - /// Default value for [`props::WriterProperties::max_row_group_size`] - pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE; - /// Default value for [`props::WriterProperties::column_index_truncate_length`] - pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = - props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; - - // TODO: discuss if we want datafusion to use these defaults from the extern 
parquet - // refer to https://github.com/apache/datafusion/issues/11367 - - #[allow(dead_code)] - /// Default value for [`props::WriterProperties::statistics_enabled`] - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); - #[allow(dead_code)] - /// Default value for [`props::BloomFilterProperties::fpp`] - pub const DEFAULT_BLOOM_FILTER_FPP: Option = - Some(props::DEFAULT_BLOOM_FILTER_FPP); - #[allow(dead_code)] - /// Default value for [`props::BloomFilterProperties::ndv`] - pub const DEFAULT_BLOOM_FILTER_NDV: Option = - Some(props::DEFAULT_BLOOM_FILTER_NDV); - - #[allow(dead_code)] - /// Default value for [props::WriterProperties::compression`] - pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; -} - -/// When note using the parquet feature, provide a manual copy -/// of the extern parquet's settings in order to compile. -/// -/// This is required since the [`ParquetOptions`] are extended with the -/// `config_namespace` macro, which does not handle internal configuration macros. 
-#[cfg(not(feature = "parquet"))] -mod parquet_defaults { - pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; - pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; - pub const DEFAULT_WRITER_VERSION: &str = "1.0"; - pub const DEFAULT_DICTIONARY_ENABLED: Option = Some(true); - pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; - pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; - pub const DEFAULT_MAX_STATISTICS_SIZE: Option = Some(4096); - pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; - pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); - - // TODO: discuss if we want datafusion to use these defaults from the extern parquet - // refer to https://github.com/apache/datafusion/issues/11367 - - #[allow(dead_code)] - pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); - #[allow(dead_code)] - pub const DEFAULT_BLOOM_FILTER_FPP: Option = Some(0.05); - #[allow(dead_code)] - pub const DEFAULT_BLOOM_FILTER_NDV: Option = Some(1_000_000_u64); -} - config_namespace! { /// Options for reading and writing parquet files /// @@ -436,28 +357,31 @@ config_namespace! { // and map to parquet::file::properties::WriterProperties /// (writing) Sets best effort maximum size of data page in bytes - pub data_pagesize_limit: usize, default = parquet_defaults::DEFAULT_PAGE_SIZE + pub data_pagesize_limit: usize, default = 1024 * 1024 /// (writing) Sets write_batch_size in bytes - pub write_batch_size: usize, default = parquet_defaults::DEFAULT_WRITE_BATCH_SIZE + pub write_batch_size: usize, default = 1024 /// (writing) Sets parquet writer version /// valid values are "1.0" and "2.0" - pub writer_version: String, default = parquet_defaults::DEFAULT_WRITER_VERSION.to_string() + pub writer_version: String, default = "1.0".to_string() /// (writing) Sets default parquet compression codec. /// Valid values are: uncompressed, snappy, gzip(level), /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. 
/// These values are not case sensitive. If NULL, uses /// default parquet writer setting + /// + /// Note that this default setting is not the same as the + /// parquet ArrowWriter default. pub compression: Option, default = Some("zstd(3)".into()) /// (writing) Sets if dictionary encoding is enabled. If NULL, uses /// default parquet writer setting - pub dictionary_enabled: Option, default = parquet_defaults::DEFAULT_DICTIONARY_ENABLED + pub dictionary_enabled: Option, default = Some(true) /// (writing) Sets best effort maximum dictionary page size, in bytes - pub dictionary_page_size_limit: usize, default = parquet_defaults::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT + pub dictionary_page_size_limit: usize, default = 1024 * 1024 /// (writing) Sets if statistics are enabled for any column /// Valid values are: "none", "chunk", and "page" @@ -467,21 +391,21 @@ config_namespace! { /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting - pub max_statistics_size: Option, default = parquet_defaults::DEFAULT_MAX_STATISTICS_SIZE + pub max_statistics_size: Option, default = Some(4096) /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. 
- pub max_row_group_size: usize, default = parquet_defaults::DEFAULT_MAX_ROW_GROUP_SIZE + pub max_row_group_size: usize, default = 1024 * 1024 /// (writing) Sets "created by" property pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into() /// (writing) Sets column index truncate length - pub column_index_truncate_length: Option, default = parquet_defaults::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH + pub column_index_truncate_length: Option, default = Some(64) /// (writing) Sets best effort maximum number of rows in data page - pub data_page_row_count_limit: usize, default = parquet_defaults::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT + pub data_page_row_count_limit: usize, default = 20_000 /// (writing) Sets default encoding for any column. /// Valid values are: plain, plain_dictionary, rle,