// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use arrow_schema::SchemaRef;
use async_trait::async_trait;
use bytes::Bytes;
use datafusion::catalog::Session;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::parquet::{
ParquetAccessPlan, ParquetExecBuilder,
};
use datafusion::datasource::physical_plan::{
parquet::ParquetFileReaderFactory, FileMeta, FileScanConfig,
};
use datafusion::datasource::TableProvider;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::parquet::arrow::arrow_reader::{
ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector,
};
use datafusion::parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader};
use datafusion::parquet::arrow::ArrowWriter;
use datafusion::parquet::file::metadata::ParquetMetaData;
use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
use datafusion::parquet::schema::types::ColumnPath;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::physical_optimizer::pruning::PruningPredicate;
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::*;
use datafusion_common::{
internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
};
use datafusion_expr::utils::conjunction;
use datafusion_expr::{TableProviderFilterPushDown, TableType};
use datafusion_physical_expr::utils::{Guarantee, LiteralGuarantee};
use futures::future::BoxFuture;
use futures::FutureExt;
use object_store::ObjectStore;
use std::any::Any;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tempfile::TempDir;
use url::Url;
/// This example demonstrates using low level DataFusion APIs to read only
/// certain row groups and ranges from parquet files, based on external
/// information.
///
/// Using these APIs, you can instruct DataFusion's parquet reader to skip
/// ("prune") portions of files that do not contain relevant data. These APIs
/// can be useful for doing low latency queries over a large number of Parquet
/// files on remote storage (e.g. S3) where the cost of reading the metadata for
/// each file is high (e.g. because it requires a network round trip to the
/// storage service).
///
/// Depending on the information from the index, DataFusion can make a request
/// to the storage service (e.g. S3) to read only the necessary data.
///
/// Note that this example uses a hard coded index implementation. For a more
/// realistic example of creating an index to prune files, see the
/// `parquet_index.rs` example.
///
/// Specifically, this example illustrates how to:
/// 1. Use [`ParquetFileReaderFactory`] to avoid re-reading parquet metadata on each query
/// 2. Use [`PruningPredicate`] for predicate analysis
/// 3. Pass a row group selection to [`ParquetExec`]
/// 4. Pass a row selection (within a row group) to [`ParquetExec`]
///
/// Note this is a *VERY* low level example for people who want to build their
/// own custom indexes (e.g. for low latency queries). Most users should use
/// higher level APIs for reading parquet files:
/// [`SessionContext::read_parquet`] or [`ListingTable`], which also do file
/// pruning based on parquet statistics (using the same underlying APIs)
///
/// # Diagram
///
/// This diagram shows how the `ParquetExec` is configured to do only a single
/// (range) read from a parquet file, for the data that is needed. It does
/// not read the file footer or any of the row groups that are not needed.
///
/// ```text
/// ┌───────────────────────┐ The TableProvider configures the
/// │ ┌───────────────────┐ │ ParquetExec:
/// │ │ │ │
/// │ └───────────────────┘ │
/// │ ┌───────────────────┐ │
/// Row │ │ │ │ 1. To read only specific Row
/// Groups │ └───────────────────┘ │ Groups (the ParquetExec tries
/// │ ┌───────────────────┐ │ to reduce this further based
/// │ │ │ │ on metadata)
/// │ └───────────────────┘ │ ┌────────────────────┐
/// │ ┌───────────────────┐ │ │ │
/// │ │ │◀┼ ─ ─ ┐ │ ParquetExec │
/// │ └───────────────────┘ │ │ (Parquet Reader) │
/// │ ... │ └ ─ ─ ─ ─│ │
/// │ ┌───────────────────┐ │ │ ╔═══════════════╗ │
/// │ │ │ │ │ ║ParquetMetadata║ │
/// │ └───────────────────┘ │ │ ╚═══════════════╝ │
/// │ ╔═══════════════════╗ │ └────────────────────┘
/// │ ║ Thrift metadata ║ │
/// │ ╚═══════════════════╝ │ 2. With cached ParquetMetadata, so
/// └───────────────────────┘ the ParquetExec does not re-read /
/// Parquet File decode the thrift footer
///
/// ```
///
/// Within a Row Group, Column Chunks store data in DataPages. This example also
/// shows how to configure the ParquetExec to read a `RowSelection` (row ranges)
/// which will skip unneeded data pages. This requires that the Parquet file has
/// a [Page Index].
///
/// ```text
/// ┌───────────────────────┐ If the RowSelection does not include any
/// │ ... │ rows from a particular Data Page, that
/// │ │ Data Page is not fetched or decoded.
/// │ ┌───────────────────┐ │ Note this requires a PageIndex
/// │ │ ┌──────────┐ │ │
/// Row │ │ │DataPage 0│ │ │ ┌────────────────────┐
/// Groups │ │ └──────────┘ │ │ │ │
/// │ │ ┌──────────┐ │ │ │ ParquetExec │
/// │ │ ... │DataPage 1│ ◀┼ ┼ ─ ─ ─ │ (Parquet Reader) │
/// │ │ └──────────┘ │ │ └ ─ ─ ─ ─ ─│ │
/// │ │ ┌──────────┐ │ │ │ ╔═══════════════╗ │
/// │ │ │DataPage 2│ │ │ If only rows │ ║ParquetMetadata║ │
/// │ │ └──────────┘ │ │ from DataPage 1 │ ╚═══════════════╝ │
/// │ └───────────────────┘ │ are selected, └────────────────────┘
/// │ │ only DataPage 1
/// │ ... │ is fetched and
/// │ │ decoded
/// │ ╔═══════════════════╗ │
/// │ ║ Thrift metadata ║ │
/// │ ╚═══════════════════╝ │
/// └───────────────────────┘
/// Parquet File
/// ```
///
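/// # Example: building a `ParquetAccessPlan`
///
/// As a rough sketch (illustrative only; the real plan is built in
/// `IndexTableProvider::create_plan` below), an access plan for this file's 10
/// row groups that reads only row group 9, and within it only the row at
/// offset 50 (the row with `id = 950`), could be constructed like this:
///
/// ```ignore
/// // start with a plan that skips every row group
/// let mut plan = ParquetAccessPlan::new_none(10);
/// // mark row group 9 to be scanned
/// plan.scan(9);
/// // narrow it to a single row: skip 50 rows, read 1, skip the remaining 49
/// plan.scan_selection(
///     9,
///     RowSelection::from(vec![
///         RowSelector::skip(50),
///         RowSelector::select(1),
///         RowSelector::skip(49),
///     ]),
/// );
/// ```
///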
/// [`ListingTable`]: datafusion::datasource::listing::ListingTable
/// [Page Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
#[tokio::main]
async fn main() -> Result<()> {
// the object store is used to read the parquet files (in this case, it is
// a local file system, but in a real system it could be S3, GCS, etc)
let object_store: Arc<dyn ObjectStore> =
Arc::new(object_store::local::LocalFileSystem::new());
// Create a custom table provider with our special index.
let provider = Arc::new(IndexTableProvider::try_new(Arc::clone(&object_store))?);
// SessionContext for running queries that has the table provider
// registered as "index_table"
let ctx = SessionContext::new();
ctx.register_table("index_table", Arc::clone(&provider) as _)?;
// register an object store provider so that URLs like `file://` work
let url = Url::try_from("file://").unwrap();
ctx.register_object_store(&url, object_store);
// Select data from the table without any predicates (and thus no pruning)
println!("** Select data, no predicates:");
ctx.sql("SELECT avg(id), max(text) FROM index_table")
.await?
.show()
.await?;
// the underlying parquet reader makes 10 IO requests, one for each row group
// Now, run a query that has a predicate that our index can handle
//
// For this query, the access plan specifies skipping 8 row groups
// and scanning 2 of them. The skipped row groups are not read at all:
//
// [Skip, Skip, Scan, Skip, Skip, Skip, Skip, Scan, Skip, Skip]
//
// Note that the parquet reader makes 2 IO requests - one for the data from
// each row group.
println!("** Select data, predicate `id IN (250, 750)`");
ctx.sql("SELECT text FROM index_table WHERE id IN (250, 750)")
.await?
.show()
.await?;
// Finally, demonstrate scanning sub ranges within the row groups.
// Parquet's minimum decode unit is a page, so specifying ranges
// within a row group can be used to skip pages within a row group.
//
// For this query, the access plan specifies skipping all but the last row
// group and within the last row group, reading only the row with id 950
//
// [Skip, Skip, Skip, Skip, Skip, Skip, Skip, Skip, Skip, Selection(skip 50, select 1, skip 49)]
//
// Note that the parquet reader makes a single IO request - for the data
// pages that must be decoded
//
// Note: in order to prune pages, the Page Index must be loaded and the
// ParquetExec will load it on demand if not present. To avoid a second IO
// during the query, this example loads the Page Index up front by setting
// `ArrowReaderOptions::with_page_index` in `IndexedFile::try_new`
provider.set_use_row_selection(true);
println!("** Select data, predicate `id = 950`");
ctx.sql("SELECT text FROM index_table WHERE id = 950")
.await?
.show()
.await?;
Ok(())
}
/// DataFusion `TableProvider` that uses knowledge of how data is distributed in
/// a file to prune row groups and rows from the file.
///
/// `file1.parquet` contains values `0..1000`
#[derive(Debug)]
pub struct IndexTableProvider {
/// Pointer to temporary file storage. Kept in scope to prevent the temporary
/// folder from being deleted prematurely
_tmpdir: TempDir,
/// The file that is being read.
indexed_file: IndexedFile,
/// The underlying object store
object_store: Arc<dyn ObjectStore>,
/// if true, use row selections in addition to row group selections
use_row_selections: AtomicBool,
}
impl IndexTableProvider {
/// Create a new IndexTableProvider
/// * `object_store` - the object store implementation to use for reading files
pub fn try_new(object_store: Arc<dyn ObjectStore>) -> Result<Self> {
let tmpdir = TempDir::new().expect("Can't make temporary directory");
let indexed_file =
IndexedFile::try_new(tmpdir.path().join("indexed_file.parquet"), 0..1000)?;
Ok(Self {
indexed_file,
_tmpdir: tmpdir,
object_store,
use_row_selections: AtomicBool::new(false),
})
}
/// Set whether to use row selections in addition to row group selections
pub fn set_use_row_selection(&self, use_row_selections: bool) {
self.use_row_selections
.store(use_row_selections, Ordering::SeqCst);
}
/// Return whether row selections are used in addition to row group selections
pub fn use_row_selections(&self) -> bool {
self.use_row_selections.load(Ordering::SeqCst)
}
/// convert filters like `a = 1`, `b = 2`
/// to a single predicate like `a = 1 AND b = 2` suitable for execution
fn filters_to_predicate(
&self,
state: &dyn Session,
filters: &[Expr],
) -> Result<Arc<dyn PhysicalExpr>> {
let df_schema = DFSchema::try_from(self.schema())?;
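// `conjunction` combines the filters into a single expression joined by `AND`,
// returning `None` when `filters` is empty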
let predicate = conjunction(filters.to_vec());
let predicate = predicate
.map(|predicate| state.create_physical_expr(predicate, &df_schema))
.transpose()?
// if there are no filters, use a literal `true` so we have a predicate
// (one that always evaluates to true) that we can pass to the index
.unwrap_or_else(|| datafusion_physical_expr::expressions::lit(true));
Ok(predicate)
}
/// Returns a [`ParquetAccessPlan`] that specifies how to scan the
/// parquet file.
///
/// A `ParquetAccessPlan` specifies which row groups and which rows within
/// those row groups to scan.
fn create_plan(
&self,
predicate: &Arc<dyn PhysicalExpr>,
) -> Result<ParquetAccessPlan> {
// In this example, we use the PruningPredicate's literal guarantees to
// analyze the predicate. In a real system, using
// `PruningPredicate::prune` would likely be easier (see the sketch after
// this function).
let pruning_predicate =
PruningPredicate::try_new(Arc::clone(predicate), self.schema())?;
// The PruningPredicate's guarantees must all be satisfied in order for
// the predicate to possibly evaluate to true.
let guarantees = pruning_predicate.literal_guarantees();
let Some(constants) = self.value_constants(guarantees) else {
return Ok(self.indexed_file.scan_all_plan());
};
// Begin with a plan that skips all row groups.
let mut plan = self.indexed_file.scan_none_plan();
// determine which row groups have the values in the guarantees
for value in constants {
let ScalarValue::Int32(Some(val)) = value else {
// if we have unexpected type of constant, no pruning is possible
return Ok(self.indexed_file.scan_all_plan());
};
// Since we know the values in the file are in the range 0..1000 and
// evenly distributed across the row groups, calculate which row
// group this value appears in and tell the parquet reader to read it
let val = *val as usize;
let num_rows_in_row_group = 1000 / plan.len();
let row_group_index = val / num_rows_in_row_group;
plan.scan(row_group_index);
// If we want to use row selections, which the parquet reader can
// use to skip data pages when the parquet file has a "page index"
// and the reader is configured to read it, add a row selection
if self.use_row_selections() {
let offset_in_row_group = val - row_group_index * num_rows_in_row_group;
let selection = RowSelection::from(vec![
// skip all rows before the desired row: it sits at index
// `offset_in_row_group` within the row group
RowSelector::skip(offset_in_row_group),
// select the actual row
RowSelector::select(1),
// skip any remaining rows in the group
RowSelector::skip(num_rows_in_row_group - offset_in_row_group - 1),
]);
plan.scan_selection(row_group_index, selection);
}
}
Ok(plan)
}
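// For reference, here is a rough sketch (not used by this example) of how the
// `PruningPredicate::prune` API mentioned above could drive the same decision,
// assuming a hypothetical `index_stats` that implements `PruningStatistics`
// with one "container" per row group:
//
// let keep: Vec<bool> = pruning_predicate.prune(&index_stats)?;
// let mut plan = self.indexed_file.scan_none_plan();
// for (row_group_index, may_contain_matches) in keep.iter().enumerate() {
//     if *may_contain_matches {
//         plan.scan(row_group_index);
//     }
// }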
/// Returns the set of constants that the `"id"` column must take in order
/// for the predicate to be true.
///
/// If `None` is returned, we can't extract the necessary information from
/// the guarantees.
fn value_constants<'a>(
&self,
guarantees: &'a [LiteralGuarantee],
) -> Option<&'a HashSet<ScalarValue>> {
// only handle a single guarantee, on a single column, in this example
if guarantees.len() != 1 {
return None;
}
let guarantee = guarantees.first()?;
// Only handle IN guarantees for the "id" column
if guarantee.guarantee != Guarantee::In || guarantee.column.name() != "id" {
return None;
}
Some(&guarantee.literals)
}
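// Worked example: for the query `WHERE id IN (250, 750)` above, the pruning
// predicate produces a single `Guarantee::In` guarantee on the "id" column
// with the literal set {250, 750}, so `value_constants` returns that set and
// `create_plan` marks row groups 2 and 7 for scanning. For `WHERE id = 950`
// it returns {950}, selecting only row group 9.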
}
/// Stores information needed to scan a file
#[derive(Debug)]
struct IndexedFile {
/// File name
file_name: String,
/// The path of the file
path: PathBuf,
/// The size of the file
file_size: u64,
/// The pre-parsed parquet metadata for the file
metadata: Arc<ParquetMetaData>,
/// The arrow schema of the file
schema: SchemaRef,
}
impl IndexedFile {
fn try_new(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<Self> {
let path = path.as_ref();
// write the actual file
make_demo_file(path, value_range)?;
// Now, open the file and read its size and metadata
let file_name = path
.file_name()
.ok_or_else(|| internal_datafusion_err!("Invalid path"))?
.to_str()
.ok_or_else(|| internal_datafusion_err!("Invalid filename"))?
.to_string();
let file_size = path.metadata()?.len();
let file = File::open(path).map_err(|e| {
DataFusionError::from(e).context(format!("Error opening file {path:?}"))
})?;
let options = ArrowReaderOptions::new()
// Load the page index when reading metadata to cache
// so it is available to interpret row selections
.with_page_index(true);
let reader =
ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?;
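// Because `with_page_index(true)` was set above, the metadata returned by the
// builder already contains the parsed page index; cloning the
// `Arc<ParquetMetaData>` is cheap and lets every query reuse it.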
let metadata = reader.metadata().clone();
let schema = reader.schema().clone();
// canonicalize after writing the file
let path = std::fs::canonicalize(path)?;
Ok(Self {
file_name,
path,
file_size,
metadata,
schema,
})
}
/// Return a `PartitionedFile` to scan the underlying file
///
/// The returned value does not have any `ParquetAccessPlan` specified in
/// its extensions.
fn partitioned_file(&self) -> PartitionedFile {
PartitionedFile::new(self.path.display().to_string(), self.file_size)
}
/// Return a `ParquetAccessPlan` that scans all row groups in the file
fn scan_all_plan(&self) -> ParquetAccessPlan {
ParquetAccessPlan::new_all(self.metadata.num_row_groups())
}
/// Return a `ParquetAccessPlan` that scans no row groups in the file
fn scan_none_plan(&self) -> ParquetAccessPlan {
ParquetAccessPlan::new_none(self.metadata.num_row_groups())
}
}
/// Implement the TableProvider trait for IndexTableProvider
/// so that we can query it as a table.
#[async_trait]
impl TableProvider for IndexTableProvider {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
Arc::clone(&self.indexed_file.schema)
}
fn table_type(&self) -> TableType {
TableType::Base
}
async fn scan(
&self,
state: &dyn Session,
projection: Option<&Vec<usize>>,
filters: &[Expr],
limit: Option<usize>,
) -> Result<Arc<dyn ExecutionPlan>> {
let indexed_file = &self.indexed_file;
let predicate = self.filters_to_predicate(state, filters)?;
// Figure out which row groups to scan based on the predicate
let access_plan = self.create_plan(&predicate)?;
println!("{access_plan:?}");
let partitioned_file = indexed_file
.partitioned_file()
// provide the starting access plan to the ParquetExec by
// storing it as "extensions" on PartitionedFile
.with_extensions(Arc::new(access_plan) as _);
// Prepare for scanning
let schema = self.schema();
let object_store_url = ObjectStoreUrl::parse("file://")?;
let file_scan_config = FileScanConfig::new(object_store_url, schema)
.with_limit(limit)
.with_projection(projection.cloned())
.with_file(partitioned_file);
// Configure a factory interface to avoid re-reading the metadata for each file
let reader_factory =
CachedParquetFileReaderFactory::new(Arc::clone(&self.object_store))
.with_file(indexed_file);
// Finally, put it all together into a ParquetExec
Ok(ParquetExecBuilder::new(file_scan_config)
// provide the predicate so the ParquetExec can try and prune
// row groups internally
.with_predicate(predicate)
// provide the factory to create parquet reader without re-reading metadata
.with_parquet_file_reader_factory(Arc::new(reader_factory))
.build_arc())
}
/// Tell DataFusion to push filters down to the scan method
fn supports_filters_pushdown(
&self,
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
// Inexact because the pruning can't handle all expressions and pruning
// is not done at the row level -- there may be rows in returned files
// that do not pass the filter
Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()])
}
}
/// A custom [`ParquetFileReaderFactory`] that handles opening parquet files
/// from object storage, and uses pre-loaded metadata.
#[derive(Debug)]
struct CachedParquetFileReaderFactory {
/// The underlying object store implementation for reading file data
object_store: Arc<dyn ObjectStore>,
/// The parquet metadata for each file in the index, keyed by the file name
/// (e.g. `file1.parquet`)
metadata: HashMap<String, Arc<ParquetMetaData>>,
}
impl CachedParquetFileReaderFactory {
fn new(object_store: Arc<dyn ObjectStore>) -> Self {
Self {
object_store,
metadata: HashMap::new(),
}
}
/// Add the pre-parsed information about the file to the factory
fn with_file(mut self, indexed_file: &IndexedFile) -> Self {
self.metadata.insert(
indexed_file.file_name.clone(),
Arc::clone(&indexed_file.metadata),
);
self
}
}
impl ParquetFileReaderFactory for CachedParquetFileReaderFactory {
fn create_reader(
&self,
_partition_index: usize,
file_meta: FileMeta,
metadata_size_hint: Option<usize>,
_metrics: &ExecutionPlanMetricsSet,
) -> Result<Box<dyn AsyncFileReader + Send>> {
// for this example we ignore the partition index and metrics
// but in a real system you would likely use them to report details on
// the performance of the reader.
let filename = file_meta
.location()
.parts()
.last()
.expect("No path in location")
.as_ref()
.to_string();
let object_store = Arc::clone(&self.object_store);
let mut inner = ParquetObjectReader::new(object_store, file_meta.object_meta);
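// If DataFusion provides a metadata (footer) size hint, pass it to the inner
// reader. It is effectively unused in this example because `get_metadata`
// below returns the cached metadata instead of reading the footer.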
if let Some(hint) = metadata_size_hint {
inner = inner.with_footer_size_hint(hint)
};
let metadata = self
.metadata
.get(&filename)
.expect("metadata for file not found: {filename}");
Ok(Box::new(ParquetReaderWithCache {
filename,
metadata: Arc::clone(metadata),
inner,
}))
}
}
/// Wrapper around a `ParquetObjectReader` that returns pre-parsed, cached metadata
struct ParquetReaderWithCache {
filename: String,
metadata: Arc<ParquetMetaData>,
inner: ParquetObjectReader,
}
impl AsyncFileReader for ParquetReaderWithCache {
fn get_bytes(
&mut self,
range: Range<usize>,
) -> BoxFuture<'_, datafusion::parquet::errors::Result<Bytes>> {
println!("get_bytes: {} Reading range {:?}", self.filename, range);
self.inner.get_bytes(range)
}
fn get_byte_ranges(
&mut self,
ranges: Vec<Range<usize>>,
) -> BoxFuture<'_, datafusion::parquet::errors::Result<Vec<Bytes>>> {
println!(
"get_byte_ranges: {} Reading ranges {:?}",
self.filename, ranges
);
self.inner.get_byte_ranges(ranges)
}
fn get_metadata(
&mut self,
) -> BoxFuture<'_, datafusion::parquet::errors::Result<Arc<ParquetMetaData>>> {
println!("get_metadata: {} returning cached metadata", self.filename);
// return the cached metadata so the parquet reader does not read it
let metadata = self.metadata.clone();
async move { Ok(metadata) }.boxed()
}
}
/// Creates a new parquet file at the specified path.
///
/// * id: Int32
/// * text: Utf8
///
/// The `id` column increases sequentially over the given `value_range`
/// The `text` column contains the value `TheTextValue{i}` for each row `i`
///
/// Each row group has 100 rows
fn make_demo_file(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<()> {
let path = path.as_ref();
let file = File::create(path)?;
let id = Int32Array::from_iter_values(value_range.clone());
let text =
StringArray::from_iter_values(value_range.map(|i| format!("TheTextValue{i}")));
let batch = RecordBatch::try_from_iter(vec![
("id", Arc::new(id) as ArrayRef),
("text", Arc::new(text) as ArrayRef),
])?;
let schema = batch.schema();
// enable page level statistics for the `id` column,
// and chunk (row group) level statistics for everything else
let props = WriterProperties::builder()
.set_max_row_group_size(100)
// compute column chunk (per row group) statistics by default
.set_statistics_enabled(EnabledStatistics::Chunk)
// compute column page statistics for the `id` column used in the predicates
.set_column_statistics_enabled(ColumnPath::from("id"), EnabledStatistics::Page)
.build();
// write the actual values to the file
let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
writer.write(&batch)?;
writer.close()?;
Ok(())
}
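// A quick way to sanity check the layout written above (a sketch, not invoked
// by this example): re-open the file and confirm it contains 10 row groups of
// 100 rows each.
//
// let file = File::open(path)?;
// let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
// assert_eq!(builder.metadata().num_row_groups(), 10);
// assert!(builder
//     .metadata()
//     .row_groups()
//     .iter()
//     .all(|rg| rg.num_rows() == 100));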