From 1ee68621b488180e316f23e3d02e0fc12abeb7db Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Feb 2024 09:58:59 -0500 Subject: [PATCH 01/10] Add example of using PruningPredicate --- datafusion-examples/README.md | 1 + datafusion-examples/examples/pruning.rs | 186 ++++++++++++++++++ .../core/src/physical_optimizer/pruning.rs | 8 + 3 files changed, 195 insertions(+) create mode 100644 datafusion-examples/examples/pruning.rs diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 5b1a5e24853c..d06916f55017 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -56,6 +56,7 @@ cargo run --example csv_sql - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients - [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es +- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 diff --git a/datafusion-examples/examples/pruning.rs b/datafusion-examples/examples/pruning.rs new file mode 100644 index 000000000000..8ba1352f5776 --- /dev/null +++ b/datafusion-examples/examples/pruning.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, BooleanArray, Int32Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::common::{DFSchema, ScalarValue}; +use datafusion::execution::context::ExecutionProps; +use datafusion::physical_expr::create_physical_expr; +use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; +use datafusion::prelude::*; +use std::collections::HashSet; +use std::sync::Arc; + +/// This example shows how to use DataFusion's `PruningPredicate` to prove +/// filter expressions can never be true based on statistics such as min/max +/// values of columns. +/// +/// The process is called "pruning" and is commonly used in query engines to +/// quickly eliminate entire files / partitions / row groups of data from +/// consideration using statistical information from a catalog or other +/// metadata. +#[tokio::main] +async fn main() { + // In this example, we'll use the PruningPredicate to determine if + // the expression `x = 5 AND y = 10` can never be true based on statistics + + // Start with the expression `x = 5 AND y = 10` + let expr = col("x").eq(lit(5)).and(col("y").eq(lit(10))); + + // We can analyze this predicate using information provided by the + // `PruningStatistics` trait, in this case we'll use a simple catalog that + // models three files. For all rows in each file: + // + // File 1: x has values between `4` and `6` + // y has the value 10 + // + // File 1: x has values between `4` and `6` + // y has the value of `7` + // + // File 3: x has the value 1 + // nothing is known about the value of y + let my_catalog = MyCatalog::new(); + + // Create a `PruningPredicate`. + // + // Note the predicate does not automatically coerce types or simplify + // expressions. See expr_api.rs examples for how to do this if required + let predicate = create_pruning_predicate(expr, &my_catalog.schema); + + // Evaluate the predicate for the three files in the catalog + let prune_results = predicate.prune(&my_catalog).unwrap(); + println!("Pruning results: {prune_results:?}"); + + // The result is a `Vec` of bool values, one for each file in the catalog + assert_eq!( + prune_results, + vec![ + // File 1: `x = 5 AND y = 10` can evaluate to true if x has values + // between `4` and `6`, y has the value `10`, so the file can not be + // skipped + // + // NOTE this doesn't mean there actually are rows that evaluate to + // true, but the pruning predicate can't prove there aren't any. + true, + // File 2: `x = 5 AND y = 10` can never evaluate to true because y + // has only the value of 7. Thus this file can be skipped. + false, + // File 3: `x = 5 AND y = 10` can never evaluate to true because x + // has the value `1`, and for any value of `y` the expression will + // evaluate to false or null (not true). Thus this file can also be + // skipped. + false + ] + ); +} + +/// A simple model catalog that has information about the three files that store +/// data for a table with two columns (x and y). +struct MyCatalog { + schema: SchemaRef, + // (min, max) for x + x_values: Vec<(Option, Option)>, + // (min, max) for y + y_values: Vec<(Option, Option)>, +} +impl MyCatalog { + fn new() -> Self { + MyCatalog { + schema: Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])), + x_values: vec![ + // File 1: x has values between `4` and `6` + (Some(4), Some(6)), + // File 2: x has values between `4` and `6` + (Some(4), Some(6)), + // File 3: x has the value 1 + (Some(1), Some(1)), + ], + y_values: vec![ + // File 1: y has the value 10 + (Some(10), Some(10)), + // File 2: y has the value of `7` + (Some(7), Some(7)), + // File 3: nothing is known about the value of y. This is + // represented as (None, None). + // + // Note, returning null means the value isn't known, NOT + // that we know the entire column is null. + (None, None), + ], + } + } +} + +/// We communicate the statistical information to DataFusion by implementing the +/// PruningStatistics trait. +impl PruningStatistics for MyCatalog { + fn num_containers(&self) -> usize { + // there are 3 files in this "catalog", and thus each array returned + // from min_values and max_values also has 3 elements + 3 + } + + fn min_values(&self, column: &Column) -> Option { + // The pruning predicate evaluates the bounds for multiple expressions + // at once, so return an array with an element for the minimum value in + // each file + match column.name.as_str() { + "x" => Some(i32_array(self.x_values.iter().map(|(min, _)| min))), + "y" => Some(i32_array(self.y_values.iter().map(|(min, _)| min))), + name => panic!("unknown column name: {name}"), + } + } + + fn max_values(&self, column: &Column) -> Option { + // similarly to min_values, return an array with an element for the + // maximum value in each file + match column.name.as_str() { + "x" => Some(i32_array(self.x_values.iter().map(|(_, max)| max))), + "y" => Some(i32_array(self.y_values.iter().map(|(_, max)| max))), + name => panic!("unknown column name: {name}"), + } + } + + fn null_counts(&self, _column: &Column) -> Option { + // In this example, we know nothing about the number of nulls + None + } + + fn contained( + &self, + _column: &Column, + _values: &HashSet, + ) -> Option { + // this method can be used to implement Bloom filter like filtering + // but we do not illustrate that here + None + } +} + +fn create_pruning_predicate(expr: Expr, schema: &SchemaRef) -> PruningPredicate { + let df_schema = DFSchema::try_from(schema.as_ref().clone()).unwrap(); + let props = ExecutionProps::new(); + let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); + PruningPredicate::try_new(physical_expr, schema.clone()).unwrap() +} + +fn i32_array<'a>(values: impl Iterator>) -> ArrayRef { + Arc::new(Int32Array::from_iter(values.cloned())) +} diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index aa0c26723767..ceb9e598f63d 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -136,6 +136,8 @@ pub trait PruningStatistics { /// possibly evaluate to `true` given information about a column provided by /// [`PruningStatistics`]. /// +/// # Introduction +/// /// `PruningPredicate` analyzes filter expressions using statistics such as /// min/max values and null counts, attempting to prove a "container" (e.g. /// Parquet Row Group) can be skipped without reading the actual data, @@ -163,6 +165,12 @@ pub trait PruningStatistics { /// /// # Example /// +/// See the [`pruning.rs` example in the `datafusion-examples`] for a complete +/// example of how to use `PruningPredicate` to prune files based on min/max +/// values. +/// +/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/pruning.rs +/// /// Given an expression like `x = 5` and statistics for 3 containers (Row /// Groups, files, etc) `A`, `B`, and `C`: /// From 9ba480530e02cbc2429732eff5918fd71e72b376 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Feb 2024 12:17:29 -0500 Subject: [PATCH 02/10] prettier --- datafusion-examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index d06916f55017..9646cee45e7a 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -56,7 +56,7 @@ cargo run --example csv_sql - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients - [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es -- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics +- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 From 8bee9c80238a2ac3f1cd532046cb60500fb0bfb7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Feb 2024 12:45:16 -0500 Subject: [PATCH 03/10] Docs: Extend PruningPredicate with background and implementation information --- .../core/src/physical_optimizer/pruning.rs | 168 +++++++++++++++++- 1 file changed, 164 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index ceb9e598f63d..8b7819bb439d 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -149,11 +149,12 @@ pub trait PruningStatistics { /// for any row in the Row Group, the entire Row Group is skipped during query /// execution. /// -/// The `PruningPredicate` API is designed to be general, so it can used for -/// pruning other types of containers (e.g. files) based on statistics that may -/// be known from external catalogs (e.g. Delta Lake) or other sources. +/// The `PruningPredicate` API is general, and can be used for pruning other +/// types of containers (e.g. files) based on statistics that may be known from +/// external catalogs (e.g. Delta Lake) or other sources. How his works is a +/// subtle topic. See the Background and Implementation section for details. /// -/// It currently supports: +/// `PruningPredicate` supports: /// /// 1. Arbitrary expressions (including user defined functions) /// @@ -190,6 +191,165 @@ pub trait PruningStatistics { /// ``` /// /// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information. +/// +/// # Background +/// +/// ## Boolean Tri-state logic +/// +/// To understand the details of the rest of this documentation, it is important +/// to understand how the tri-state boolean logic in SQL works. As this is +/// somewhat esoteric, we review it here. +/// +/// SQL has a notion of `NULL` that represents the value is `“unknown”` and this +/// uncertainty propagates through expressions. SQL `NULL` behaves very +/// differently than the `NULL` in most other languages where it is a special, +/// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with +/// `NULL` is powerful and elegant, SQL `NULL` s are often deeply confusing when +/// first encountered as they behave differently than most programmers may +/// expect. +/// +/// In most other programming languages, +/// * `a == NULL` evaluates to `true` if `a` also had the value `NULL` +/// * `a == NULL` evaluates to `false` if a has any other value +/// +/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`): +/// +/// | Expression | Result | +/// | ------------- | --------- | +/// | `1 = NULL` | `NULL` | +/// | `NULL = NULL` | `NULL` | +/// +/// Also important is how `AND` and `OR` works with tri-state boolean logic as +/// (perhaps counterintuitively) the result is **not** always NULL. While +/// consistent with the notion of `NULL` representing “unknown”, this is again, +/// often deeply confusing 🤯 when first encountered. +/// +/// | Expression | Result | Intuition | +/// | --------------- | --------- | ----------- | +/// | `NULL AND true` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change | +/// | `NULL AND false` | `false` | If the `NULL` was either `true` or `false` the overall expression is still `false` | +/// | `NULL AND NULL` | `NULL` | | +/// +/// | Expression | Result | Intuition | +/// | --------------- | --------- | ---------- | +/// | `NULL OR true` | `true` | If the `NULL` was either `true` or `false` the overall expression is still `true` | +/// | `NULL OR false` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change | +/// | `NULL OR NULL` | `NULL` | | +/// +/// ## SQL Filter Semantics +/// +/// The SQL `WHERE` clause has a boolean expression, often called a filter or +/// predicate. The semantics of this predicate are that the query evaluates the +/// predicate for each row in the input tables and: +/// +/// * Rows that evaluate to `true` are returned in the query results +/// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”). +/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below* +/// +/// # `PruningPredicate` Implementation +/// Armed with the information in Here is how the PruningPredicate logic works today +/// +/// ## Interface +/// +/// **Inputs** +/// 1. An input schema describing what columns exist +/// +/// 2. A predicate (expression that evaluates to a boolean) +/// +/// 3. [`PruningStatistics`] that provides information about columns in that +/// schema, for multiple “containers”. For each column in each container, it +/// provides optional information on contained values, min_values, max_values, +/// and null_counts counts. +/// +/// **Outputs**: +/// A (non null) boolean value for each container: +/// * `true`: There MAY be rows that match the predicate +/// +/// * `false`: There are no rows that could possibly match the predicate (the +/// predicate can never possibly be true). The container can be pruned (skipped) +/// entirely. +/// +/// Note that in order to be correct, `PruningPredicate` must return false +/// **only** if it can determine that for all rows in the container, the +/// predicate could never evaluate to `true` (always evaluates to either `NULL` +/// or `false`). +/// +/// ## Contains Analysis and Min/Max Rewrite +/// +/// `PruningPredicate` works by first analyzing the predicate to see what +/// [`LiteralGuarantee`] must hold for the predicate to be true. +/// +/// Then, the `PruningPredicate` rewrites the original predicate into an +/// expression that references the min/max values of each column in the original +/// predicate. +/// +/// When the min/max values are actually substituted in to this expression and +/// evaluated, the result means +/// +/// * `true`: there MAY be rows that pass the predicate, **KEEPS** the container +/// +/// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container +/// Note that rewritten predicate can evaluate to NULL when some of +/// the min/max values are not known. +/// +/// * `false`: there are no rows that could possibly match the predicate, +/// **PRUNES** the container +/// +/// For example, given a column `x`, the `x_min` and `x_max` and `x_null_count` +/// represent the minimum and maximum values, and the null count of column `x`, +/// provided by the `PruningStatistics`. Here are some examples of the rewritten +/// predicates: +/// +/// | Original Predicate | Rewritten Predicate | +/// | ------------------ | -------------------- | +/// | `x = 5` | `x_min <= 5 AND 5 <= x_max` | +/// | `x < 5` | `x_max < 5` | +/// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` | +/// | `x IS NULL` | `x_null_count > 0` | +/// +/// ## Predicate Evaluation +/// The PruningPredicate works in two passes +/// +/// **First pass**: For each `LiteralGuarantee` calls +/// [`PruningStatistics::contains`] and rules out containers where the +/// LiteralGuarantees are not satisfied +/// +/// **Second Pass**: Evaluates the rewritten expression using the +/// min/max/null_counts values for each column for each container. For any +/// container that this expression evaluates to `false`, it rules out those +/// containers. +/// +/// For example, given the predicate, `x = 5 AND y = 10`, if we know `x` is +/// between `1 and 100` and we know that `y` is between `4` and `7`, the input +/// statistics might look like +/// +/// | Column | Value | +/// | ------ | ----- | +/// | x_min | 1 | +/// | x_max | 100 | +/// | y_min | 4 | +/// | y_max | 7 | +/// +/// The rewritten predicate would look like +/// +/// `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` +/// +/// When these values are substituted in to the rewritten predicate and simplified, the result is `false`: +/// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7` +/// * `true AND true AND true AND false` +/// * `false` +/// +/// Returning `false` means the container can be pruned, which matches the +/// intuition that `x = 5 AND y = 10` can’t be true for any row if all values of `y` +/// are `7` or less. +/// +/// If, for some other container, we knew `y` was between the values `4` and +/// `15`, then the rewritten predicate evaluates to `true` (verifying this is +/// left as an exercise to the reader -- are you still here?), and the container +/// **could not** be pruned. The intuition is that there may be rows where the +/// predicate *might* evaluate to `true`, and the only way to find out is to do +/// more analysis, for example by actually reading the data and evaluating the +/// predicate row by row. #[derive(Debug, Clone)] pub struct PruningPredicate { /// The input schema against which the predicate will be evaluated From d6cd4840d96e2ec381970dda98e7930b446b389b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Feb 2024 13:40:45 -0500 Subject: [PATCH 04/10] tweaks and related work --- .../core/src/physical_optimizer/pruning.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 8b7819bb439d..aa880dd239e9 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -247,7 +247,9 @@ pub trait PruningStatistics { /// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below* /// /// # `PruningPredicate` Implementation -/// Armed with the information in Here is how the PruningPredicate logic works today +/// +/// Armed with the information in the Background section, we can now understand +/// how the `PruningPredicate` logic works today /// /// ## Interface /// @@ -350,6 +352,18 @@ pub trait PruningStatistics { /// predicate *might* evaluate to `true`, and the only way to find out is to do /// more analysis, for example by actually reading the data and evaluating the /// predicate row by row. +/// +/// # Related Work +/// +/// [`PruningPredicate`] implements the type of min/max pruning described in +/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. It is described by +/// various research such as [small materialized aggregates], [zone maps], and +/// [data skipping]. +/// +/// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741 +/// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf +/// [zone maps]: https://dl.acm.org/doi/10.1007/978-3-642-03730-6_10 +///[data skipping]: https://dl.acm.org/doi/10.1145/2588555.2610515 #[derive(Debug, Clone)] pub struct PruningPredicate { /// The input schema against which the predicate will be evaluated @@ -387,6 +401,9 @@ impl PruningPredicate { /// For example, the filter expression `(column / 2) = 4` becomes /// the pruning predicate /// `(column_min / 2) <= 4 && 4 <= (column_max / 2))` + /// + /// See the struct level documentation on [`PruningPredicate`] for more + /// details. pub fn try_new(expr: Arc, schema: SchemaRef) -> Result { // build predicate expression once let mut required_columns = RequiredColumns::new(); From f765293cd39c20cadae4e69dbb83155ad709e0b0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Feb 2024 14:58:53 -0500 Subject: [PATCH 05/10] fix typo --- datafusion/core/src/physical_optimizer/pruning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index aa880dd239e9..c52318d8e426 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -313,7 +313,7 @@ pub trait PruningStatistics { /// The PruningPredicate works in two passes /// /// **First pass**: For each `LiteralGuarantee` calls -/// [`PruningStatistics::contains`] and rules out containers where the +/// [`PruningStatistics::contained`] and rules out containers where the /// LiteralGuarantees are not satisfied /// /// **Second Pass**: Evaluates the rewritten expression using the From c59ec0d28df6a8eb65498f0bdeca7a15d0a31f13 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Feb 2024 06:38:33 -0500 Subject: [PATCH 06/10] Apply suggestions from code review Co-authored-by: Chunchun Ye <14298407+appletreeisyellow@users.noreply.github.com> --- datafusion/core/src/physical_optimizer/pruning.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index c52318d8e426..7e7057518f33 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -204,13 +204,13 @@ pub trait PruningStatistics { /// uncertainty propagates through expressions. SQL `NULL` behaves very /// differently than the `NULL` in most other languages where it is a special, /// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with -/// `NULL` is powerful and elegant, SQL `NULL` s are often deeply confusing when +/// `NULL` is powerful and elegant, SQL `NULL`s are often deeply confusing when /// first encountered as they behave differently than most programmers may /// expect. /// /// In most other programming languages, /// * `a == NULL` evaluates to `true` if `a` also had the value `NULL` -/// * `a == NULL` evaluates to `false` if a has any other value +/// * `a == NULL` evaluates to `false` if `a` has any other value /// /// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`): /// From 74261e8e36ed4788dc91a9479499bc7f6d81ea93 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Feb 2024 06:45:04 -0500 Subject: [PATCH 07/10] Clarify null semantics --- .../core/src/physical_optimizer/pruning.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 7e7057518f33..d01c85653450 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -243,13 +243,17 @@ pub trait PruningStatistics { /// predicate for each row in the input tables and: /// /// * Rows that evaluate to `true` are returned in the query results +/// /// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”). -/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below* +/// +/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”). +/// Note: *this treatment of `NULL` is **DIFFERENT** than how `NULL` is treated +/// in the rewritten predicate described below.* /// /// # `PruningPredicate` Implementation /// /// Armed with the information in the Background section, we can now understand -/// how the `PruningPredicate` logic works today +/// how the `PruningPredicate` logic works. /// /// ## Interface /// @@ -292,7 +296,9 @@ pub trait PruningStatistics { /// /// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container /// Note that rewritten predicate can evaluate to NULL when some of -/// the min/max values are not known. +/// the min/max values are not known. *Note that this is different than +/// the SQL filter semantics where `NULL` means the row is filtered +/// out.* /// /// * `false`: there are no rows that could possibly match the predicate, /// **PRUNES** the container @@ -302,8 +308,8 @@ pub trait PruningStatistics { /// provided by the `PruningStatistics`. Here are some examples of the rewritten /// predicates: /// -/// | Original Predicate | Rewritten Predicate | -/// | ------------------ | -------------------- | +/// | Original Predicate | Rewritten Predicate | +/// | ------------------ | --------------------| /// | `x = 5` | `x_min <= 5 AND 5 <= x_max` | /// | `x < 5` | `x_max < 5` | /// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` | From cd998864e7a5085769472f3c565ec8e1ee318714 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Feb 2024 06:53:04 -0500 Subject: [PATCH 08/10] fix table formatting --- .../core/src/physical_optimizer/pruning.rs | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index d01c85653450..7930e6efe673 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -212,29 +212,30 @@ pub trait PruningStatistics { /// * `a == NULL` evaluates to `true` if `a` also had the value `NULL` /// * `a == NULL` evaluates to `false` if `a` has any other value /// -/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`): +/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or +/// `false`): /// -/// | Expression | Result | -/// | ------------- | --------- | -/// | `1 = NULL` | `NULL` | -/// | `NULL = NULL` | `NULL` | +/// Expression | Result +/// ------------- | --------- +/// `1 = NULL` | `NULL` +/// `NULL = NULL` | `NULL` /// /// Also important is how `AND` and `OR` works with tri-state boolean logic as /// (perhaps counterintuitively) the result is **not** always NULL. While /// consistent with the notion of `NULL` representing “unknown”, this is again, /// often deeply confusing 🤯 when first encountered. /// -/// | Expression | Result | Intuition | -/// | --------------- | --------- | ----------- | -/// | `NULL AND true` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change | -/// | `NULL AND false` | `false` | If the `NULL` was either `true` or `false` the overall expression is still `false` | -/// | `NULL AND NULL` | `NULL` | | +/// Expression | Result | Intuition +/// --------------- | --------- | ----------- +/// `NULL AND true` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change +/// `NULL AND false` | `false` | If the `NULL` was either `true` or `false` the overall expression is still `false` +/// `NULL AND NULL` | `NULL` | /// -/// | Expression | Result | Intuition | -/// | --------------- | --------- | ---------- | -/// | `NULL OR true` | `true` | If the `NULL` was either `true` or `false` the overall expression is still `true` | -/// | `NULL OR false` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change | -/// | `NULL OR NULL` | `NULL` | | +/// Expression | Result | Intuition +/// --------------- | --------- | ---------- +/// `NULL OR true` | `true` | If the `NULL` was either `true` or `false` the overall expression is still `true` +/// `NULL OR false` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change +/// `NULL OR NULL` | `NULL` | /// /// ## SQL Filter Semantics /// @@ -308,12 +309,12 @@ pub trait PruningStatistics { /// provided by the `PruningStatistics`. Here are some examples of the rewritten /// predicates: /// -/// | Original Predicate | Rewritten Predicate | -/// | ------------------ | --------------------| -/// | `x = 5` | `x_min <= 5 AND 5 <= x_max` | -/// | `x < 5` | `x_max < 5` | -/// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` | -/// | `x IS NULL` | `x_null_count > 0` | +/// Original Predicate | Rewritten Predicate +/// ------------------ | -------------------- +/// `x = 5` | `x_min <= 5 AND 5 <= x_max` +/// `x < 5` | `x_max < 5` +/// `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` +/// `x IS NULL` | `x_null_count > 0` /// /// ## Predicate Evaluation /// The PruningPredicate works in two passes @@ -331,18 +332,20 @@ pub trait PruningStatistics { /// between `1 and 100` and we know that `y` is between `4` and `7`, the input /// statistics might look like /// -/// | Column | Value | -/// | ------ | ----- | -/// | x_min | 1 | -/// | x_max | 100 | -/// | y_min | 4 | -/// | y_max | 7 | +/// Column | Value +/// -------- | ----- +/// `x_min` | `1` +/// `x_max` | `100` +/// `y_min` | `4` +/// `y_max` | `7` /// /// The rewritten predicate would look like /// /// `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` /// -/// When these values are substituted in to the rewritten predicate and simplified, the result is `false`: +/// When these values are substituted in to the rewritten predicate and +/// simplified, the result is `false`: +/// /// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7` /// * `true AND true AND true AND false` /// * `false` From 5570047d9f2b6945cca9083f920ab4bb136b86a7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Feb 2024 06:53:22 -0500 Subject: [PATCH 09/10] fix table formatting --- datafusion/core/src/physical_optimizer/pruning.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 7930e6efe673..4140c13bdeae 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -365,9 +365,9 @@ pub trait PruningStatistics { /// # Related Work /// /// [`PruningPredicate`] implements the type of min/max pruning described in -/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. It is described by -/// various research such as [small materialized aggregates], [zone maps], and -/// [data skipping]. +/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. The technique is +/// described by various research such as [small materialized aggregates], [zone +/// maps], and [data skipping]. /// /// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741 /// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf From 059657274158d757e8bb57e9d034f908d5f0abc1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 12 Feb 2024 06:44:48 -0500 Subject: [PATCH 10/10] Update datafusion/core/src/physical_optimizer/pruning.rs Co-authored-by: Jeffrey Vo --- datafusion/core/src/physical_optimizer/pruning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 4140c13bdeae..648b1f70c58b 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -151,7 +151,7 @@ pub trait PruningStatistics { /// /// The `PruningPredicate` API is general, and can be used for pruning other /// types of containers (e.g. files) based on statistics that may be known from -/// external catalogs (e.g. Delta Lake) or other sources. How his works is a +/// external catalogs (e.g. Delta Lake) or other sources. How this works is a /// subtle topic. See the Background and Implementation section for details. /// /// `PruningPredicate` supports: