From 1ee68621b488180e316f23e3d02e0fc12abeb7db Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 9 Feb 2024 09:58:59 -0500
Subject: [PATCH 01/10] Add example of using PruningPredicate

---
 datafusion-examples/README.md                 |   1 +
 datafusion-examples/examples/pruning.rs       | 186 ++++++++++++++++++
 .../core/src/physical_optimizer/pruning.rs    |   8 +
 3 files changed, 195 insertions(+)
 create mode 100644 datafusion-examples/examples/pruning.rs

diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index 5b1a5e24853c..d06916f55017 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -56,6 +56,7 @@ cargo run --example csv_sql
 - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
 - [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
 - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es
+- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics 
 - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
 - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
 - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
diff --git a/datafusion-examples/examples/pruning.rs b/datafusion-examples/examples/pruning.rs
new file mode 100644
index 000000000000..8ba1352f5776
--- /dev/null
+++ b/datafusion-examples/examples/pruning.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray, Int32Array};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::common::{DFSchema, ScalarValue};
+use datafusion::execution::context::ExecutionProps;
+use datafusion::physical_expr::create_physical_expr;
+use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion::prelude::*;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+/// This example shows how to use  DataFusion's `PruningPredicate` to prove
+/// filter expressions can never be true based on statistics such as min/max
+/// values of columns.
+///
+/// The process is called "pruning" and is commonly used in query engines to
+/// quickly eliminate entire files / partitions / row groups of data from
+/// consideration using statistical information from a catalog or other
+/// metadata.
+#[tokio::main]
+async fn main() {
+    // In this example, we'll use the PruningPredicate to determine if
+    // the expression `x = 5 AND y = 10` can never be true based on statistics
+
+    // Start with the expression `x = 5 AND y = 10`
+    let expr = col("x").eq(lit(5)).and(col("y").eq(lit(10)));
+
+    // We can analyze this predicate using information provided by the
+    // `PruningStatistics` trait, in this case we'll use a simple catalog that
+    // models three files.  For all rows in each file:
+    //
+    //  File 1: x has values between `4` and `6`
+    //          y has the value 10
+    //
+    //  File 1: x has values between `4` and `6`
+    //          y has the value of `7`
+    //
+    //  File 3: x has the value 1
+    //          nothing is known about the value of y
+    let my_catalog = MyCatalog::new();
+
+    // Create a `PruningPredicate`.
+    //
+    // Note the predicate does not automatically coerce types or simplify
+    // expressions. See expr_api.rs examples for how to do this if required
+    let predicate = create_pruning_predicate(expr, &my_catalog.schema);
+
+    // Evaluate the predicate for the three files in the catalog
+    let prune_results = predicate.prune(&my_catalog).unwrap();
+    println!("Pruning results: {prune_results:?}");
+
+    // The result is a `Vec` of bool values, one for each file in the catalog
+    assert_eq!(
+        prune_results,
+        vec![
+            // File 1: `x = 5 AND y = 10` can evaluate to true if x has values
+            // between `4` and `6`, y has the value `10`, so the file can not be
+            // skipped
+            //
+            // NOTE this doesn't mean there actually are rows that evaluate to
+            // true, but the pruning predicate can't prove there aren't any.
+            true,
+            // File 2: `x = 5 AND y = 10` can never evaluate to true because y
+            // has only the value of 7. Thus this file can be skipped.
+            false,
+            // File 3: `x = 5 AND y = 10` can never evaluate to true because x
+            // has the value `1`, and for any value of `y` the expression will
+            // evaluate to false or null (not true). Thus this file can also be
+            // skipped.
+            false
+        ]
+    );
+}
+
+/// A simple model catalog that has information about the three files that store
+/// data for a table with two columns (x and y).
+struct MyCatalog {
+    schema: SchemaRef,
+    // (min, max) for x
+    x_values: Vec<(Option<i32>, Option<i32>)>,
+    // (min, max) for y
+    y_values: Vec<(Option<i32>, Option<i32>)>,
+}
+impl MyCatalog {
+    fn new() -> Self {
+        MyCatalog {
+            schema: Arc::new(Schema::new(vec![
+                Field::new("x", DataType::Int32, false),
+                Field::new("y", DataType::Int32, false),
+            ])),
+            x_values: vec![
+                // File 1: x has values between `4` and `6`
+                (Some(4), Some(6)),
+                // File 2: x has values between `4` and `6`
+                (Some(4), Some(6)),
+                // File 3: x has the value 1
+                (Some(1), Some(1)),
+            ],
+            y_values: vec![
+                // File 1: y has the value 10
+                (Some(10), Some(10)),
+                // File 2: y has the value of `7`
+                (Some(7), Some(7)),
+                // File 3: nothing is known about the value of y. This is
+                // represented as (None, None).
+                //
+                // Note, returning null means the value isn't known, NOT
+                // that we know the entire column is null.
+                (None, None),
+            ],
+        }
+    }
+}
+
+/// We communicate the statistical information to DataFusion by implementing the
+/// PruningStatistics trait.
+impl PruningStatistics for MyCatalog {
+    fn num_containers(&self) -> usize {
+        // there are 3 files in this "catalog", and thus each array returned
+        // from min_values and max_values also has 3 elements
+        3
+    }
+
+    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
+        // The pruning predicate evaluates the bounds for multiple expressions
+        // at once, so  return an array with an element for the minimum value in
+        // each file
+        match column.name.as_str() {
+            "x" => Some(i32_array(self.x_values.iter().map(|(min, _)| min))),
+            "y" => Some(i32_array(self.y_values.iter().map(|(min, _)| min))),
+            name => panic!("unknown column name: {name}"),
+        }
+    }
+
+    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
+        // similarly to min_values, return an array with an element for the
+        // maximum value in each file
+        match column.name.as_str() {
+            "x" => Some(i32_array(self.x_values.iter().map(|(_, max)| max))),
+            "y" => Some(i32_array(self.y_values.iter().map(|(_, max)| max))),
+            name => panic!("unknown column name: {name}"),
+        }
+    }
+
+    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
+        // In this example, we know nothing about the number of nulls
+        None
+    }
+
+    fn contained(
+        &self,
+        _column: &Column,
+        _values: &HashSet<ScalarValue>,
+    ) -> Option<BooleanArray> {
+        // this method can be used to implement Bloom filter like filtering
+        // but we do not illustrate that here
+        None
+    }
+}
+
+fn create_pruning_predicate(expr: Expr, schema: &SchemaRef) -> PruningPredicate {
+    let df_schema = DFSchema::try_from(schema.as_ref().clone()).unwrap();
+    let props = ExecutionProps::new();
+    let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
+    PruningPredicate::try_new(physical_expr, schema.clone()).unwrap()
+}
+
+fn i32_array<'a>(values: impl Iterator<Item = &'a Option<i32>>) -> ArrayRef {
+    Arc::new(Int32Array::from_iter(values.cloned()))
+}
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index aa0c26723767..ceb9e598f63d 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -136,6 +136,8 @@ pub trait PruningStatistics {
 /// possibly evaluate to `true` given information about a column provided by
 /// [`PruningStatistics`].
 ///
+/// # Introduction
+///
 /// `PruningPredicate` analyzes filter expressions using statistics such as
 /// min/max values and null counts, attempting to prove a "container" (e.g.
 /// Parquet Row Group) can be skipped without reading the actual data,
@@ -163,6 +165,12 @@ pub trait PruningStatistics {
 ///
 /// # Example
 ///
+/// See the [`pruning.rs` example in the `datafusion-examples`] for a complete
+/// example of how to use `PruningPredicate` to prune files based on min/max
+/// values.
+///
+/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/pruning.rs
+///
 /// Given an expression like `x = 5` and statistics for 3 containers (Row
 /// Groups, files, etc) `A`, `B`, and `C`:
 ///

From 9ba480530e02cbc2429732eff5918fd71e72b376 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 9 Feb 2024 12:17:29 -0500
Subject: [PATCH 02/10] prettier

---
 datafusion-examples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index d06916f55017..9646cee45e7a 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -56,7 +56,7 @@ cargo run --example csv_sql
 - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
 - [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
 - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es
-- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics 
+- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics
 - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
 - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
 - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3

From 8bee9c80238a2ac3f1cd532046cb60500fb0bfb7 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 9 Feb 2024 12:45:16 -0500
Subject: [PATCH 03/10] Docs: Extend PruningPredicate with background and
 implementation information

---
 .../core/src/physical_optimizer/pruning.rs    | 168 +++++++++++++++++-
 1 file changed, 164 insertions(+), 4 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index ceb9e598f63d..8b7819bb439d 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -149,11 +149,12 @@ pub trait PruningStatistics {
 /// for any row in the Row Group, the entire Row Group is skipped during query
 /// execution.
 ///
-/// The `PruningPredicate` API is designed to be general, so it can used for
-/// pruning other types of containers (e.g. files) based on statistics that may
-/// be known from external catalogs (e.g. Delta Lake) or other sources.
+/// The `PruningPredicate` API is general, and can be used for pruning other
+/// types of containers (e.g. files) based on statistics that may be known from
+/// external catalogs (e.g. Delta Lake) or other sources. How his works is a
+/// subtle topic.  See the Background and Implementation section for details.
 ///
-/// It currently supports:
+/// `PruningPredicate` supports:
 ///
 /// 1. Arbitrary expressions (including user defined functions)
 ///
@@ -190,6 +191,165 @@ pub trait PruningStatistics {
 /// ```
 ///
 /// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information.
+///
+/// # Background
+///
+/// ## Boolean Tri-state logic
+///
+/// To understand the details of the rest of this documentation, it is important
+/// to understand how the tri-state boolean logic in SQL works. As this is
+/// somewhat esoteric, we review it here.
+///
+/// SQL has a notion of `NULL` that represents the value is `“unknown”` and this
+/// uncertainty propagates through expressions. SQL `NULL` behaves very
+/// differently than the `NULL` in most other languages where it is a special,
+/// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with
+/// `NULL` is powerful and elegant, SQL `NULL` s are often deeply confusing when
+/// first encountered as they behave differently than most programmers may
+/// expect.
+///
+/// In most other programming languages,
+/// * `a == NULL` evaluates to `true` if `a` also had the value `NULL`
+/// * `a == NULL` evaluates to `false` if a has any other value
+///
+/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`):
+///
+/// | Expression     | Result   |
+/// | ------------- | --------- |
+/// | `1 = NULL`    | `NULL`    |
+/// | `NULL = NULL` | `NULL`    |
+///
+/// Also important is how `AND` and `OR` works with tri-state boolean logic as
+/// (perhaps counterintuitively) the result is **not** always NULL. While
+/// consistent with the notion of `NULL` representing “unknown”, this is again,
+/// often deeply confusing 🤯 when first encountered.
+///
+/// | Expression       | Result    | Intuition   |
+/// | ---------------  | --------- | ----------- |
+/// | `NULL AND true`  |   `NULL`  | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change |
+/// | `NULL AND false` |  `false`  | If the `NULL` was either `true` or `false` the overall expression is still `false` |
+/// | `NULL AND NULL`  | `NULL`    |            |
+///
+/// | Expression      | Result    | Intuition |
+/// | --------------- | --------- | ---------- |
+/// | `NULL OR true`  | `true`    |  If the `NULL` was either `true` or `false` the overall expression is still `true` |
+/// | `NULL OR false` | `NULL`    |  The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change |
+/// | `NULL OR NULL`  |  `NULL`   |            |
+///
+/// ## SQL Filter Semantics
+///
+/// The SQL `WHERE` clause has a boolean expression, often called a filter or
+/// predicate. The semantics of this predicate are that the query evaluates the
+/// predicate for each row in the input tables and:
+///
+/// * Rows that evaluate to `true` are returned in the query results
+/// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”).
+/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below*
+///
+/// # `PruningPredicate` Implementation
+/// Armed with the information in Here is how the PruningPredicate logic works today
+///
+/// ## Interface
+///
+/// **Inputs**
+/// 1. An input schema describing what columns exist
+///
+/// 2. A predicate (expression that evaluates to a boolean)
+///
+/// 3. [`PruningStatistics`] that provides information about columns in that
+/// schema, for multiple “containers”. For each column in each container, it
+/// provides optional information on contained values, min_values, max_values,
+/// and null_counts counts.
+///
+/// **Outputs**:
+/// A (non null) boolean value for each container:
+/// * `true`: There MAY be rows that match the predicate
+///
+/// * `false`: There are no rows that could possibly match the predicate (the
+/// predicate can never possibly be true). The container can be pruned (skipped)
+/// entirely.
+///
+/// Note that in order to be correct, `PruningPredicate` must return false
+/// **only** if it can determine that for all rows in the container, the
+/// predicate could never evaluate to `true` (always evaluates to either `NULL`
+/// or `false`).
+///
+/// ## Contains Analysis and Min/Max Rewrite
+///
+/// `PruningPredicate` works by first analyzing the predicate to see what
+/// [`LiteralGuarantee`] must hold for the predicate to be true.
+///
+/// Then, the `PruningPredicate` rewrites the original predicate into an
+/// expression that references the min/max values of each column in the original
+/// predicate.
+///
+/// When the min/max values are actually substituted in to this expression and
+/// evaluated, the result means
+///
+/// * `true`: there MAY be rows that pass the predicate, **KEEPS** the container
+///
+/// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container
+///           Note that rewritten predicate can evaluate to NULL when some of
+///           the min/max values are not known.
+///
+/// * `false`: there are no rows that could possibly match the predicate,
+///            **PRUNES** the container
+///
+/// For example, given a column `x`, the `x_min` and `x_max` and `x_null_count`
+/// represent the minimum and maximum values, and the null count of column `x`,
+/// provided by the `PruningStatistics`. Here are some examples of the rewritten
+/// predicates:
+///
+/// | Original Predicate |  Rewritten Predicate |
+/// | ------------------ | -------------------- |
+/// | `x = 5` | `x_min <= 5 AND 5 <= x_max` |
+/// | `x < 5` | `x_max < 5` |
+/// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` |
+/// | `x IS NULL`  | `x_null_count > 0` |
+///
+/// ## Predicate Evaluation
+/// The PruningPredicate works in two passes
+///
+/// **First pass**:  For each `LiteralGuarantee` calls
+/// [`PruningStatistics::contains`] and rules out containers where the
+/// LiteralGuarantees are not satisfied
+///
+/// **Second Pass**: Evaluates the rewritten expression using the
+/// min/max/null_counts values for each column for each container. For any
+/// container that this expression evaluates to `false`, it rules out those
+/// containers.
+///
+/// For example, given the predicate, `x = 5 AND y = 10`, if we know `x` is
+/// between `1 and 100` and we know that `y` is between `4` and `7`, the input
+/// statistics might look like
+///
+/// | Column | Value |
+/// | ------ | ----- |
+/// | x_min | 1    |
+/// | x_max | 100  |
+/// | y_min |  4   |
+/// | y_max | 7    |
+///
+/// The rewritten predicate would look like
+///
+/// `x_min <= 5 AND 5 <= x_max AND  y_min <= 10 AND 10 <= y_max`
+///
+/// When these values are substituted in to the rewritten predicate and simplified, the result is `false`:
+/// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7`
+/// * `true AND true AND true AND false`
+/// * `false`
+///
+/// Returning `false` means the container can be pruned, which matches the
+/// intuition that  `x = 5 AND y = 10` can’t be true for any row if all values of `y`
+/// are `7` or less.
+///
+/// If, for some other container, we knew `y` was between the values `4` and
+/// `15`, then the rewritten predicate evaluates to `true` (verifying this is
+/// left as an exercise to the reader -- are you still here?), and the container
+/// **could not** be pruned. The intuition is that there may be rows where the
+/// predicate *might* evaluate to `true`, and the only way to find out is to do
+/// more analysis, for example by actually reading the data and evaluating the
+/// predicate row by row.
 #[derive(Debug, Clone)]
 pub struct PruningPredicate {
     /// The input schema against which the predicate will be evaluated

From d6cd4840d96e2ec381970dda98e7930b446b389b Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 9 Feb 2024 13:40:45 -0500
Subject: [PATCH 04/10] tweaks and related work

---
 .../core/src/physical_optimizer/pruning.rs    | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 8b7819bb439d..aa880dd239e9 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -247,7 +247,9 @@ pub trait PruningStatistics {
 /// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below*
 ///
 /// # `PruningPredicate` Implementation
-/// Armed with the information in Here is how the PruningPredicate logic works today
+///
+/// Armed with the information in the Background section, we can now understand
+/// how the `PruningPredicate` logic works today
 ///
 /// ## Interface
 ///
@@ -350,6 +352,18 @@ pub trait PruningStatistics {
 /// predicate *might* evaluate to `true`, and the only way to find out is to do
 /// more analysis, for example by actually reading the data and evaluating the
 /// predicate row by row.
+///
+/// # Related Work
+///
+/// [`PruningPredicate`] implements the type of min/max pruning described in
+/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. It is described by
+/// various research such as [small materialized aggregates], [zone maps], and
+/// [data skipping].
+///
+/// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741
+/// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf
+/// [zone maps]: https://dl.acm.org/doi/10.1007/978-3-642-03730-6_10
+///[data skipping]: https://dl.acm.org/doi/10.1145/2588555.2610515
 #[derive(Debug, Clone)]
 pub struct PruningPredicate {
     /// The input schema against which the predicate will be evaluated
@@ -387,6 +401,9 @@ impl PruningPredicate {
     /// For example, the filter expression `(column / 2) = 4` becomes
     /// the pruning predicate
     /// `(column_min / 2) <= 4 && 4 <= (column_max / 2))`
+    ///
+    /// See the struct level documentation on [`PruningPredicate`] for more
+    /// details.
     pub fn try_new(expr: Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Result<Self> {
         // build predicate expression once
         let mut required_columns = RequiredColumns::new();

From f765293cd39c20cadae4e69dbb83155ad709e0b0 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 9 Feb 2024 14:58:53 -0500
Subject: [PATCH 05/10] fix typo

---
 datafusion/core/src/physical_optimizer/pruning.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index aa880dd239e9..c52318d8e426 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -313,7 +313,7 @@ pub trait PruningStatistics {
 /// The PruningPredicate works in two passes
 ///
 /// **First pass**:  For each `LiteralGuarantee` calls
-/// [`PruningStatistics::contains`] and rules out containers where the
+/// [`PruningStatistics::contained`] and rules out containers where the
 /// LiteralGuarantees are not satisfied
 ///
 /// **Second Pass**: Evaluates the rewritten expression using the

From c59ec0d28df6a8eb65498f0bdeca7a15d0a31f13 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sun, 11 Feb 2024 06:38:33 -0500
Subject: [PATCH 06/10] Apply suggestions from code review

Co-authored-by: Chunchun Ye <14298407+appletreeisyellow@users.noreply.github.com>
---
 datafusion/core/src/physical_optimizer/pruning.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index c52318d8e426..7e7057518f33 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -204,13 +204,13 @@ pub trait PruningStatistics {
 /// uncertainty propagates through expressions. SQL `NULL` behaves very
 /// differently than the `NULL` in most other languages where it is a special,
 /// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with
-/// `NULL` is powerful and elegant, SQL `NULL` s are often deeply confusing when
+/// `NULL` is powerful and elegant, SQL `NULL`s are often deeply confusing when
 /// first encountered as they behave differently than most programmers may
 /// expect.
 ///
 /// In most other programming languages,
 /// * `a == NULL` evaluates to `true` if `a` also had the value `NULL`
-/// * `a == NULL` evaluates to `false` if a has any other value
+/// * `a == NULL` evaluates to `false` if `a` has any other value
 ///
 /// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`):
 ///

From 74261e8e36ed4788dc91a9479499bc7f6d81ea93 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sun, 11 Feb 2024 06:45:04 -0500
Subject: [PATCH 07/10] Clarify null semantics

---
 .../core/src/physical_optimizer/pruning.rs       | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 7e7057518f33..d01c85653450 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -243,13 +243,17 @@ pub trait PruningStatistics {
 /// predicate for each row in the input tables and:
 ///
 /// * Rows that evaluate to `true` are returned in the query results
+///
 /// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”).
-/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”) – *this property appears many times in the discussion below*
+///
+/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”).
+///   Note: *this treatment of `NULL` is **DIFFERENT** than how `NULL` is treated
+///   in the rewritten predicate described below.*
 ///
 /// # `PruningPredicate` Implementation
 ///
 /// Armed with the information in the Background section, we can now understand
-/// how the `PruningPredicate` logic works today
+/// how the `PruningPredicate` logic works.
 ///
 /// ## Interface
 ///
@@ -292,7 +296,9 @@ pub trait PruningStatistics {
 ///
 /// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container
 ///           Note that rewritten predicate can evaluate to NULL when some of
-///           the min/max values are not known.
+///           the min/max values are not known. *Note that this is different than
+///           the SQL filter semantics where `NULL` means the row is filtered
+///           out.*
 ///
 /// * `false`: there are no rows that could possibly match the predicate,
 ///            **PRUNES** the container
@@ -302,8 +308,8 @@ pub trait PruningStatistics {
 /// provided by the `PruningStatistics`. Here are some examples of the rewritten
 /// predicates:
 ///
-/// | Original Predicate |  Rewritten Predicate |
-/// | ------------------ | -------------------- |
+/// | Original Predicate | Rewritten Predicate |
+/// | ------------------ | --------------------|
 /// | `x = 5` | `x_min <= 5 AND 5 <= x_max` |
 /// | `x < 5` | `x_max < 5` |
 /// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` |

From cd998864e7a5085769472f3c565ec8e1ee318714 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sun, 11 Feb 2024 06:53:04 -0500
Subject: [PATCH 08/10] fix table formatting

---
 .../core/src/physical_optimizer/pruning.rs    | 59 ++++++++++---------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index d01c85653450..7930e6efe673 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -212,29 +212,30 @@ pub trait PruningStatistics {
 /// * `a == NULL` evaluates to `true` if `a` also had the value `NULL`
 /// * `a == NULL` evaluates to `false` if `a` has any other value
 ///
-/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or `false`):
+/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or
+/// `false`):
 ///
-/// | Expression     | Result   |
-/// | ------------- | --------- |
-/// | `1 = NULL`    | `NULL`    |
-/// | `NULL = NULL` | `NULL`    |
+/// Expression    | Result
+/// ------------- | ---------
+/// `1 = NULL`    | `NULL`
+/// `NULL = NULL` | `NULL`
 ///
 /// Also important is how `AND` and `OR` works with tri-state boolean logic as
 /// (perhaps counterintuitively) the result is **not** always NULL. While
 /// consistent with the notion of `NULL` representing “unknown”, this is again,
 /// often deeply confusing 🤯 when first encountered.
 ///
-/// | Expression       | Result    | Intuition   |
-/// | ---------------  | --------- | ----------- |
-/// | `NULL AND true`  |   `NULL`  | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change |
-/// | `NULL AND false` |  `false`  | If the `NULL` was either `true` or `false` the overall expression is still `false` |
-/// | `NULL AND NULL`  | `NULL`    |            |
+/// Expression       | Result    | Intuition
+/// ---------------  | --------- | -----------
+/// `NULL AND true`  |   `NULL`  | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change
+/// `NULL AND false` |  `false`  | If the `NULL` was either `true` or `false` the overall expression is still `false`
+/// `NULL AND NULL`  | `NULL`    |
 ///
-/// | Expression      | Result    | Intuition |
-/// | --------------- | --------- | ---------- |
-/// | `NULL OR true`  | `true`    |  If the `NULL` was either `true` or `false` the overall expression is still `true` |
-/// | `NULL OR false` | `NULL`    |  The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change |
-/// | `NULL OR NULL`  |  `NULL`   |            |
+/// Expression      | Result    | Intuition
+/// --------------- | --------- | ----------
+/// `NULL OR true`  | `true`    |  If the `NULL` was either `true` or `false` the overall expression is still `true`
+/// `NULL OR false` | `NULL`    |  The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change
+/// `NULL OR NULL`  |  `NULL`   |
 ///
 /// ## SQL Filter Semantics
 ///
@@ -308,12 +309,12 @@ pub trait PruningStatistics {
 /// provided by the `PruningStatistics`. Here are some examples of the rewritten
 /// predicates:
 ///
-/// | Original Predicate | Rewritten Predicate |
-/// | ------------------ | --------------------|
-/// | `x = 5` | `x_min <= 5 AND 5 <= x_max` |
-/// | `x < 5` | `x_max < 5` |
-/// | `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max` |
-/// | `x IS NULL`  | `x_null_count > 0` |
+/// Original Predicate | Rewritten Predicate
+/// ------------------ | --------------------
+/// `x = 5` | `x_min <= 5 AND 5 <= x_max`
+/// `x < 5` | `x_max < 5`
+/// `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max`
+/// `x IS NULL`  | `x_null_count > 0`
 ///
 /// ## Predicate Evaluation
 /// The PruningPredicate works in two passes
@@ -331,18 +332,20 @@ pub trait PruningStatistics {
 /// between `1 and 100` and we know that `y` is between `4` and `7`, the input
 /// statistics might look like
 ///
-/// | Column | Value |
-/// | ------ | ----- |
-/// | x_min | 1    |
-/// | x_max | 100  |
-/// | y_min |  4   |
-/// | y_max | 7    |
+/// Column   | Value
+/// -------- | -----
+/// `x_min`  | `1`
+/// `x_max`  | `100`
+/// `y_min`  | `4`
+/// `y_max`  | `7`
 ///
 /// The rewritten predicate would look like
 ///
 /// `x_min <= 5 AND 5 <= x_max AND  y_min <= 10 AND 10 <= y_max`
 ///
-/// When these values are substituted in to the rewritten predicate and simplified, the result is `false`:
+/// When these values are substituted in to the rewritten predicate and
+/// simplified, the result is `false`:
+///
 /// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7`
 /// * `true AND true AND true AND false`
 /// * `false`

From 5570047d9f2b6945cca9083f920ab4bb136b86a7 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sun, 11 Feb 2024 06:53:22 -0500
Subject: [PATCH 09/10] fix table formatting

---
 datafusion/core/src/physical_optimizer/pruning.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 7930e6efe673..4140c13bdeae 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -365,9 +365,9 @@ pub trait PruningStatistics {
 /// # Related Work
 ///
 /// [`PruningPredicate`] implements the type of min/max pruning described in
-/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. It is described by
-/// various research such as [small materialized aggregates], [zone maps], and
-/// [data skipping].
+/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. The technique is
+/// described by various research such as [small materialized aggregates], [zone
+/// maps], and [data skipping].
 ///
 /// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741
 /// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf

From 059657274158d757e8bb57e9d034f908d5f0abc1 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Mon, 12 Feb 2024 06:44:48 -0500
Subject: [PATCH 10/10] Update
 datafusion/core/src/physical_optimizer/pruning.rs

Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
---
 datafusion/core/src/physical_optimizer/pruning.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 4140c13bdeae..648b1f70c58b 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -151,7 +151,7 @@ pub trait PruningStatistics {
 ///
 /// The `PruningPredicate` API is general, and can be used for pruning other
 /// types of containers (e.g. files) based on statistics that may be known from
-/// external catalogs (e.g. Delta Lake) or other sources. How his works is a
+/// external catalogs (e.g. Delta Lake) or other sources. How this works is a
 /// subtle topic.  See the Background and Implementation section for details.
 ///
 /// `PruningPredicate` supports: