diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 8539ca0874dd..e3bd5219267c 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -395,8 +395,11 @@ config_namespace! {
         /// default parquet writer setting
         pub encoding: Option<String>, default = None

-        /// Sets if bloom filter is enabled for any column
-        pub bloom_filter_enabled: bool, default = false
+        /// Use any available bloom filters when reading parquet files
+        pub bloom_filter_on_read: bool, default = true
+
+        /// Write bloom filters for all columns when creating parquet files
+        pub bloom_filter_on_write: bool, default = false

         /// Sets bloom filter false positive probability. If NULL, uses
         /// default parquet writer setting
@@ -1654,6 +1657,7 @@ config_namespace! {
 }

 #[derive(Debug, Clone, PartialEq)]
+#[allow(clippy::large_enum_variant)]
 pub enum FormatOptions {
     CSV(CsvOptions),
     JSON(JsonOptions),
diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs
index a760619a7ba8..59040b4290b0 100644
--- a/datafusion/common/src/file_options/mod.rs
+++ b/datafusion/common/src/file_options/mod.rs
@@ -67,7 +67,7 @@ mod tests {
             "format.data_page_row_count_limit".to_owned(),
             "123".to_owned(),
         );
-        option_map.insert("format.bloom_filter_enabled".to_owned(), "true".to_owned());
+        option_map.insert("format.bloom_filter_on_write".to_owned(), "true".to_owned());
         option_map.insert("format.encoding".to_owned(), "plain".to_owned());
         option_map.insert("format.dictionary_enabled".to_owned(), "true".to_owned());
         option_map.insert("format.compression".to_owned(), "zstd(4)".to_owned());
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index 8ac6bcaa7adf..009164a29e34 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -62,7 +62,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
             created_by,
             column_index_truncate_length,
             data_page_row_count_limit,
-            bloom_filter_enabled,
+            bloom_filter_on_write,
             encoding,
             dictionary_enabled,
             compression,
@@ -80,6 +80,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
             allow_single_file_parallelism: _,
             maximum_parallel_row_group_writers: _,
             maximum_buffered_record_batches_per_stream: _,
+            bloom_filter_on_read: _,
         } = &parquet_options.global;

         let key_value_metadata = if !parquet_options.key_value_metadata.is_empty() {
@@ -104,7 +105,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
             .set_created_by(created_by.clone())
             .set_column_index_truncate_length(*column_index_truncate_length)
             .set_data_page_row_count_limit(*data_page_row_count_limit)
-            .set_bloom_filter_enabled(*bloom_filter_enabled)
+            .set_bloom_filter_enabled(*bloom_filter_on_write)
             .set_key_value_metadata(key_value_metadata);

         if let Some(encoding) = &encoding {
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 6ee19828f1d4..2d14a801ace0 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1603,7 +1603,7 @@ mod tests {
             "50".into(),
         );
         config_map.insert(
-            "datafusion.execution.parquet.bloom_filter_enabled".into(),
+            "datafusion.execution.parquet.bloom_filter_on_write".into(),
             "true".into(),
         );
         config_map.insert(
@@ -1681,7 +1681,7 @@ mod tests {
             "delta_binary_packed".into(),
         );
         config_map.insert(
"datafusion.execution.parquet.bloom_filter_enabled".into(), + "datafusion.execution.parquet.bloom_filter_on_write".into(), "true".into(), ); config_map.insert( diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 73fb82980fc4..68cc024dbf24 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -243,14 +243,24 @@ impl ParquetExec { } /// If enabled, the reader will read by the bloom filter - pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> Self { - self.table_parquet_options.global.bloom_filter_enabled = enable_bloom_filter; + pub fn with_bloom_filter_on_read(mut self, bloom_filter_on_read: bool) -> Self { + self.table_parquet_options.global.bloom_filter_on_read = bloom_filter_on_read; self } - /// Return the value described in [`Self::with_enable_bloom_filter`] - fn enable_bloom_filter(&self) -> bool { - self.table_parquet_options.global.bloom_filter_enabled + /// If enabled, the writer will write by the bloom filter + pub fn with_bloom_filter_on_write( + mut self, + enable_bloom_filter_on_write: bool, + ) -> Self { + self.table_parquet_options.global.bloom_filter_on_write = + enable_bloom_filter_on_write; + self + } + + /// Return the value described in [`Self::with_bloom_filter_on_read`] + fn bloom_filter_on_read(&self) -> bool { + self.table_parquet_options.global.bloom_filter_on_read } fn output_partitioning_helper(file_config: &FileScanConfig) -> Partitioning { @@ -407,7 +417,7 @@ impl ExecutionPlan for ParquetExec { pushdown_filters: self.pushdown_filters(), reorder_filters: self.reorder_filters(), enable_page_index: self.enable_page_index(), - enable_bloom_filter: self.enable_bloom_filter(), + enable_bloom_filter: self.bloom_filter_on_read(), }; let stream = diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 28275d484e29..8bcf62dc95ce 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -344,12 +344,12 @@ impl SessionConfig { /// Returns true if bloom filter should be used to skip parquet row groups pub fn parquet_bloom_filter_pruning(&self) -> bool { - self.options.execution.parquet.bloom_filter_enabled + self.options.execution.parquet.bloom_filter_on_read } /// Enables or disables the use of bloom filter for parquet readers to skip row groups pub fn with_parquet_bloom_filter_pruning(mut self, enabled: bool) -> Self { - self.options.execution.parquet.bloom_filter_enabled = enabled; + self.options.execution.parquet.bloom_filter_on_read = enabled; self } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index c2653fa96f82..6dbff707fcfa 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -1215,10 +1215,12 @@ message ParquetOptions { uint64 data_pagesize_limit = 7; // default = 1024 * 1024 uint64 write_batch_size = 8; // default = 1024 string writer_version = 9; // default = "1.0" - bool bloom_filter_enabled = 20; // default = false + // bool bloom_filter_enabled = 20; // default = false bool allow_single_file_parallelism = 23; // default = true uint64 maximum_parallel_row_group_writers = 24; // default = 1 uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2 + bool bloom_filter_on_read = 26; // default = true + bool bloom_filter_on_write = 27; // default = false oneof metadata_size_hint_opt { uint64 
     uint64 metadata_size_hint = 4;
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 0fb6f4623745..3cd8611afa3e 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -15976,9 +15976,6 @@ impl serde::Serialize for ParquetOptions {
         if !self.writer_version.is_empty() {
             len += 1;
         }
-        if self.bloom_filter_enabled {
-            len += 1;
-        }
         if self.allow_single_file_parallelism {
             len += 1;
         }
@@ -15988,6 +15985,12 @@ impl serde::Serialize for ParquetOptions {
         if self.maximum_parallel_row_group_writers != 0 {
             len += 1;
         }
         if self.maximum_buffered_record_batches_per_stream != 0 {
             len += 1;
         }
+        if self.bloom_filter_on_read {
+            len += 1;
+        }
+        if self.bloom_filter_on_write {
+            len += 1;
+        }
         if self.dictionary_page_size_limit != 0 {
             len += 1;
         }
@@ -16054,9 +16057,6 @@ impl serde::Serialize for ParquetOptions {
         if !self.writer_version.is_empty() {
             struct_ser.serialize_field("writerVersion", &self.writer_version)?;
         }
-        if self.bloom_filter_enabled {
-            struct_ser.serialize_field("bloomFilterEnabled", &self.bloom_filter_enabled)?;
-        }
         if self.allow_single_file_parallelism {
             struct_ser.serialize_field("allowSingleFileParallelism", &self.allow_single_file_parallelism)?;
         }
@@ -16068,6 +16068,12 @@ impl serde::Serialize for ParquetOptions {
         if self.maximum_buffered_record_batches_per_stream != 0 {
             #[allow(clippy::needless_borrow)]
             struct_ser.serialize_field("maximumBufferedRecordBatchesPerStream", ToString::to_string(&self.maximum_buffered_record_batches_per_stream).as_str())?;
         }
+        if self.bloom_filter_on_read {
+            struct_ser.serialize_field("bloomFilterOnRead", &self.bloom_filter_on_read)?;
+        }
+        if self.bloom_filter_on_write {
+            struct_ser.serialize_field("bloomFilterOnWrite", &self.bloom_filter_on_write)?;
+        }
         if self.dictionary_page_size_limit != 0 {
             #[allow(clippy::needless_borrow)]
             struct_ser.serialize_field("dictionaryPageSizeLimit", ToString::to_string(&self.dictionary_page_size_limit).as_str())?;
@@ -16175,14 +16181,16 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             "writeBatchSize",
             "writer_version",
             "writerVersion",
-            "bloom_filter_enabled",
-            "bloomFilterEnabled",
             "allow_single_file_parallelism",
             "allowSingleFileParallelism",
             "maximum_parallel_row_group_writers",
             "maximumParallelRowGroupWriters",
             "maximum_buffered_record_batches_per_stream",
             "maximumBufferedRecordBatchesPerStream",
+            "bloom_filter_on_read",
+            "bloomFilterOnRead",
+            "bloom_filter_on_write",
+            "bloomFilterOnWrite",
             "dictionary_page_size_limit",
             "dictionaryPageSizeLimit",
             "data_page_row_count_limit",
@@ -16219,10 +16227,11 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
            DataPagesizeLimit,
            WriteBatchSize,
            WriterVersion,
-            BloomFilterEnabled,
            AllowSingleFileParallelism,
            MaximumParallelRowGroupWriters,
            MaximumBufferedRecordBatchesPerStream,
+            BloomFilterOnRead,
+            BloomFilterOnWrite,
            DictionaryPageSizeLimit,
            DataPageRowCountLimit,
            MaxRowGroupSize,
@@ -16265,10 +16274,11 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                            "dataPagesizeLimit" | "data_pagesize_limit" => Ok(GeneratedField::DataPagesizeLimit),
                            "writeBatchSize" | "write_batch_size" => Ok(GeneratedField::WriteBatchSize),
                            "writerVersion" | "writer_version" => Ok(GeneratedField::WriterVersion),
-                            "bloomFilterEnabled" | "bloom_filter_enabled" => Ok(GeneratedField::BloomFilterEnabled),
                            "allowSingleFileParallelism" | "allow_single_file_parallelism" => Ok(GeneratedField::AllowSingleFileParallelism),
                            "maximumParallelRowGroupWriters" | "maximum_parallel_row_group_writers" => Ok(GeneratedField::MaximumParallelRowGroupWriters),
"maximum_buffered_record_batches_per_stream" => Ok(GeneratedField::MaximumBufferedRecordBatchesPerStream), + "bloomFilterOnRead" | "bloom_filter_on_read" => Ok(GeneratedField::BloomFilterOnRead), + "bloomFilterOnWrite" | "bloom_filter_on_write" => Ok(GeneratedField::BloomFilterOnWrite), "dictionaryPageSizeLimit" | "dictionary_page_size_limit" => Ok(GeneratedField::DictionaryPageSizeLimit), "dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit), "maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize), @@ -16309,10 +16319,11 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut data_pagesize_limit__ = None; let mut write_batch_size__ = None; let mut writer_version__ = None; - let mut bloom_filter_enabled__ = None; let mut allow_single_file_parallelism__ = None; let mut maximum_parallel_row_group_writers__ = None; let mut maximum_buffered_record_batches_per_stream__ = None; + let mut bloom_filter_on_read__ = None; + let mut bloom_filter_on_write__ = None; let mut dictionary_page_size_limit__ = None; let mut data_page_row_count_limit__ = None; let mut max_row_group_size__ = None; @@ -16380,12 +16391,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } writer_version__ = Some(map_.next_value()?); } - GeneratedField::BloomFilterEnabled => { - if bloom_filter_enabled__.is_some() { - return Err(serde::de::Error::duplicate_field("bloomFilterEnabled")); - } - bloom_filter_enabled__ = Some(map_.next_value()?); - } GeneratedField::AllowSingleFileParallelism => { if allow_single_file_parallelism__.is_some() { return Err(serde::de::Error::duplicate_field("allowSingleFileParallelism")); @@ -16408,6 +16413,18 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) ; } + GeneratedField::BloomFilterOnRead => { + if bloom_filter_on_read__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterOnRead")); + } + bloom_filter_on_read__ = Some(map_.next_value()?); + } + GeneratedField::BloomFilterOnWrite => { + if bloom_filter_on_write__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterOnWrite")); + } + bloom_filter_on_write__ = Some(map_.next_value()?); + } GeneratedField::DictionaryPageSizeLimit => { if dictionary_page_size_limit__.is_some() { return Err(serde::de::Error::duplicate_field("dictionaryPageSizeLimit")); @@ -16503,10 +16520,11 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { data_pagesize_limit: data_pagesize_limit__.unwrap_or_default(), write_batch_size: write_batch_size__.unwrap_or_default(), writer_version: writer_version__.unwrap_or_default(), - bloom_filter_enabled: bloom_filter_enabled__.unwrap_or_default(), allow_single_file_parallelism: allow_single_file_parallelism__.unwrap_or_default(), maximum_parallel_row_group_writers: maximum_parallel_row_group_writers__.unwrap_or_default(), maximum_buffered_record_batches_per_stream: maximum_buffered_record_batches_per_stream__.unwrap_or_default(), + bloom_filter_on_read: bloom_filter_on_read__.unwrap_or_default(), + bloom_filter_on_write: bloom_filter_on_write__.unwrap_or_default(), dictionary_page_size_limit: dictionary_page_size_limit__.unwrap_or_default(), data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(), max_row_group_size: max_row_group_size__.unwrap_or_default(), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index a7aa73d1b621..10a48dcc4e01 100644 --- 
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -1911,9 +1911,8 @@ pub struct ParquetOptions {
     /// default = "1.0"
     #[prost(string, tag = "9")]
     pub writer_version: ::prost::alloc::string::String,
-    /// default = false
-    #[prost(bool, tag = "20")]
-    pub bloom_filter_enabled: bool,
+    /// bool bloom_filter_enabled = 20; // default = false
+    ///
     /// default = true
     #[prost(bool, tag = "23")]
     pub allow_single_file_parallelism: bool,
@@ -1923,6 +1922,12 @@ pub struct ParquetOptions {
     /// default = 2
     #[prost(uint64, tag = "25")]
     pub maximum_buffered_record_batches_per_stream: u64,
+    /// default = true
+    #[prost(bool, tag = "26")]
+    pub bloom_filter_on_read: bool,
+    /// default = false
+    #[prost(bool, tag = "27")]
+    pub bloom_filter_on_write: bool,
     #[prost(uint64, tag = "12")]
     pub dictionary_page_size_limit: u64,
     #[prost(uint64, tag = "18")]
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs
index e9728d8542fe..b9e2c9c72be2 100644
--- a/datafusion/proto/src/physical_plan/from_proto.rs
+++ b/datafusion/proto/src/physical_plan/from_proto.rs
@@ -879,7 +879,8 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
                    protobuf::parquet_options::EncodingOpt::Encoding(v) => Some(v),
                })
                .unwrap_or(None),
-            bloom_filter_enabled: value.bloom_filter_enabled,
+            bloom_filter_on_read: value.bloom_filter_on_read,
+            bloom_filter_on_write: value.bloom_filter_on_write,
            bloom_filter_fpp: value.clone()
                .bloom_filter_fpp_opt
                .map(|opt| match opt {
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index aa6121bebc34..c20190f89335 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -911,7 +911,8 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
            column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)),
            data_page_row_count_limit: value.data_page_row_count_limit as u64,
            encoding_opt: value.encoding.clone().map(protobuf::parquet_options::EncodingOpt::Encoding),
-            bloom_filter_enabled: value.bloom_filter_enabled,
+            bloom_filter_on_read: value.bloom_filter_on_read,
+            bloom_filter_on_write: value.bloom_filter_on_write,
            bloom_filter_fpp_opt: value.bloom_filter_fpp.map(protobuf::parquet_options::BloomFilterFppOpt::BloomFilterFpp),
            bloom_filter_ndv_opt: value.bloom_filter_ndv.map(protobuf::parquet_options::BloomFilterNdvOpt::BloomFilterNdv),
            allow_single_file_parallelism: value.allow_single_file_parallelism,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 1fd6160c2c6c..65985f86801e 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -344,7 +344,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> {
        TableOptions::default_from_session_config(ctx.state().config_options());
    let mut parquet_format = table_options.parquet;

-    parquet_format.global.bloom_filter_enabled = true;
+    parquet_format.global.bloom_filter_on_read = true;
    parquet_format.global.created_by = "DataFusion Test".to_string();
    parquet_format.global.writer_version = "PARQUET_2_0".to_string();
    parquet_format.global.write_batch_size = 111;
diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt
index d695e8514b07..882a4e220758 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -271,7 +271,7 @@ OPTIONS (
 'format.created_by' 'DF copy.slt',
 'format.column_index_truncate_length' 123,
 'format.data_page_row_count_limit' 1234,
-'format.bloom_filter_enabled' true,
+'format.bloom_filter_on_read' true,
 'format.bloom_filter_enabled::col1' false,
 'format.bloom_filter_fpp::col2' 0.456,
 'format.bloom_filter_ndv::col2' 456,
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 8f4b1a3816a3..cbf2a48b604f 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -172,9 +172,10 @@ datafusion.execution.max_buffered_batches_per_output_file 2
 datafusion.execution.meta_fetch_concurrency 32
 datafusion.execution.minimum_parallel_output_files 4
 datafusion.execution.parquet.allow_single_file_parallelism true
-datafusion.execution.parquet.bloom_filter_enabled false
 datafusion.execution.parquet.bloom_filter_fpp NULL
 datafusion.execution.parquet.bloom_filter_ndv NULL
+datafusion.execution.parquet.bloom_filter_on_read true
+datafusion.execution.parquet.bloom_filter_on_write false
 datafusion.execution.parquet.column_index_truncate_length NULL
 datafusion.execution.parquet.compression zstd(3)
 datafusion.execution.parquet.created_by datafusion
@@ -250,9 +251,10 @@ datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum
 datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics
 datafusion.execution.minimum_parallel_output_files 4 Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached.
 datafusion.execution.parquet.allow_single_file_parallelism true Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns.
-datafusion.execution.parquet.bloom_filter_enabled false Sets if bloom filter is enabled for any column
 datafusion.execution.parquet.bloom_filter_fpp NULL Sets bloom filter false positive probability. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.bloom_filter_ndv NULL Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.bloom_filter_on_read true Use any available bloom filters when reading parquet files
+datafusion.execution.parquet.bloom_filter_on_write false Write bloom filters for all columns when creating parquet files
 datafusion.execution.parquet.column_index_truncate_length NULL Sets column index truncate length
 datafusion.execution.parquet.compression zstd(3) Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.created_by datafusion Sets "created by" property
diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt
index abb36c3c0858..caf79abcfa4e 100644
--- a/datafusion/sqllogictest/test_files/predicates.slt
+++ b/datafusion/sqllogictest/test_files/predicates.slt
@@ -19,6 +19,9 @@
 ## Predicates Tests
 ##########

+statement ok
+set datafusion.catalog.information_schema = true;
+
 statement ok
 CREATE EXTERNAL TABLE aggregate_test_100 (
   c1 VARCHAR NOT NULL,
@@ -514,8 +517,38 @@ DROP TABLE t;

 statement ok
 CREATE EXTERNAL TABLE data_index_bloom_encoding_stats STORED AS PARQUET LOCATION '../../parquet-testing/data/data_index_bloom_encoding_stats.parquet';

+query TT
+SHOW datafusion.execution.parquet.bloom_filter_on_read
+----
+datafusion.execution.parquet.bloom_filter_on_read true
+
+query T
+SELECT * FROM data_index_bloom_encoding_stats WHERE "String" = 'foo';
+----
+
+query T
+SELECT * FROM data_index_bloom_encoding_stats WHERE "String" = 'test';
+----
+test
+
+query T
+SELECT * FROM data_index_bloom_encoding_stats WHERE "String" like '%e%';
+----
+Hello
+test
+are you
+the quick
+over
+the lazy
+
+
+########
+# Test query without bloom filter
+# Refer to https://github.com/apache/datafusion/pull/7821#pullrequestreview-1688062599
+########
+
 statement ok
-set datafusion.execution.parquet.bloom_filter_enabled=true;
+set datafusion.execution.parquet.bloom_filter_on_read=false;

 query T
 SELECT * FROM data_index_bloom_encoding_stats WHERE "String" = 'foo';
 ----
@@ -537,7 +570,7 @@
 over
 the lazy

 statement ok
-set datafusion.execution.parquet.bloom_filter_enabled=false;
+set datafusion.execution.parquet.bloom_filter_on_read=true;

 ########
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 3ee3778177c4..a8797c4600f8 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -68,7 +68,8 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length |
 | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page |
 | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting |
-| datafusion.execution.parquet.bloom_filter_enabled | false | Sets if bloom filter is enabled for any column |
+| datafusion.execution.parquet.bloom_filter_on_read | true | Use any available bloom filters when reading parquet files |
+| datafusion.execution.parquet.bloom_filter_on_write | false | Write bloom filters for all columns when creating parquet files |
 | datafusion.execution.parquet.bloom_filter_fpp | NULL | Sets bloom filter false positive probability. If NULL, uses default parquet writer setting |
 | datafusion.execution.parquet.bloom_filter_ndv | NULL | Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting |
 | datafusion.execution.parquet.allow_single_file_parallelism | true | Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. |