Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure statistic defaults in parquet writers are in sync #11656

Merged
merged 8 commits into from
Jul 27, 2024
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ config_namespace! {
/// Valid values are: "none", "chunk", and "page"
/// These values are not case sensitive. If NULL, uses
/// default parquet writer setting
pub statistics_enabled: Option<String>, default = None
pub statistics_enabled: Option<String>, default = Some("page".into())

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
Expand Down
118 changes: 31 additions & 87 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ mod tests {
ColumnOptions {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("page".into()),
statistics_enabled: Some("none".into()),
max_statistics_size: Some(72),
encoding: Some("RLE".into()),
bloom_filter_enabled: Some(true),
Expand Down Expand Up @@ -614,23 +614,7 @@ mod tests {
"should indicate that table_parquet_opts defaults came from datafusion",
);

// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone();
let mut from_extern_parquet =
session_config_from_writer_props(&default_writer_props);
from_extern_parquet.global.created_by = same_created_by;
// TODO: the remaining defaults do not match!
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
default_table_writer_opts,
from_extern_parquet,
"the default writer_props should have the same configuration as the session's default TableParquetOptions",
);

// Below here itemizes how the defaults **should** match, but do not.

// TODO: compression defaults do not match
// refer to https://github.com/apache/datafusion/issues/11367
// Expected: the datafusion default compression is different from arrow-rs's parquet
assert_eq!(
default_writer_props.compression(&"default".into()),
Compression::UNCOMPRESSED,
Expand All @@ -644,35 +628,13 @@ mod tests {
"datafusion's default is zstd"
);

// datafusion's `None` for Option<String> => becomes parquet's EnabledStatistics::Page
// TODO: should this be changed?
// refer to https://github.com/apache/datafusion/issues/11367
assert_eq!(
default_writer_props.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"extern parquet's default is page"
);
assert_eq!(
default_table_writer_opts.global.statistics_enabled, None,
"datafusion's has no default"
);
assert_eq!(
from_datafusion_defaults.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"should see the extern parquet's default over-riding datafusion's None",
);

// Confirm all other settings are equal.
// First resolve the known discrepancies, (set as the same).
// TODO: once we fix the above mis-matches, we should be able to remove this.
// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone();
let mut from_extern_parquet =
session_config_from_writer_props(&default_writer_props);
from_extern_parquet.global.created_by = same_created_by;
from_extern_parquet.global.compression = Some("zstd(3)".into());
from_extern_parquet.global.statistics_enabled = None;

// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different
from_extern_parquet.global.created_by = same_created_by; // we expect these to be different
assert_eq!(
default_table_writer_opts,
from_extern_parquet,
Expand All @@ -685,31 +647,25 @@ mod tests {
// the TableParquetOptions::default, with only the bloom filter turned on
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;

// the WriterProperties::default, with only the bloom filter turned on
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only the bloom filter turned on
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties::default()),
"datafusion's has BloomFilterProperties::default",
"should use the default bloom filter props"
);
}

Expand All @@ -719,35 +675,29 @@ mod tests {
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_fpp = Some(0.42);

// the WriterProperties::default, with only fpp set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_fpp(0.42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only fpp set
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.set_bloom_filter_fpp(0.42)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: 0.42,
ndv: DEFAULT_BLOOM_FILTER_NDV
}),
"datafusion's has BloomFilterProperties",
"should have only the fpp set, and the ndv at default",
);
}

Expand All @@ -757,35 +707,29 @@ mod tests {
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_ndv = Some(42);

// the WriterProperties::default, with only ndv set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_ndv(42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only ndv set
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.set_bloom_filter_ndv(42)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: DEFAULT_BLOOM_FILTER_FPP,
ndv: 42
}),
"datafusion's has BloomFilterProperties",
"should have only the ndv set, and the fpp at default",
);
}
}
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ datafusion.execution.parquet.pruning true
datafusion.execution.parquet.pushdown_filters false
datafusion.execution.parquet.reorder_filters false
datafusion.execution.parquet.skip_metadata true
datafusion.execution.parquet.statistics_enabled NULL
datafusion.execution.parquet.statistics_enabled page
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is an improvement -- it doesn't change the default value (NULL means use arrow-rs defaults, which is page), but now the default value is explicit in the config settings

Also there is a test to ensure the defaults don't drift from the arrow-rs defaults accidentally

datafusion.execution.parquet.write_batch_size 1024
datafusion.execution.parquet.writer_version 1.0
datafusion.execution.planning_concurrency 13
Expand Down Expand Up @@ -288,7 +288,7 @@ datafusion.execution.parquet.pruning true (reading) If true, the parquet reader
datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
datafusion.execution.parquet.statistics_enabled NULL (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes
datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0"
datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
Expand Down
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. |
| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes |
| datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. |
| datafusion.execution.parquet.created_by | datafusion version 40.0.0 | (writing) Sets "created by" property |
Expand Down
Loading