From da82cece8181d5af0d38243229ecb2b4722cb4a9 Mon Sep 17 00:00:00 2001
From: Phillip LeBlanc
Date: Tue, 23 Apr 2024 19:36:31 +0900
Subject: [PATCH 1/6] Update .asf.yaml to publish docs to datafusion.apache.org
 (#10190)

* Update .asf.yaml to publish to root

* Update website to have top-level ASF links
---
 .asf.yaml             |  5 ++---
 docs/README.md        |  8 ++++----
 docs/source/index.rst | 11 +++++++++++
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/.asf.yaml b/.asf.yaml
index 2c8a74080e8f..8e939c695d57 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -27,7 +27,7 @@ notifications:
   jira_options: link label worklog
 github:
   description: "Apache DataFusion SQL Query Engine"
-  homepage: https://arrow.apache.org/datafusion
+  homepage: https://datafusion.apache.org/
   labels:
     - arrow
     - big-data
@@ -50,7 +50,6 @@ github:
       required_approving_review_count: 1

 # publishes the content of the `asf-site` branch to
-# https://arrow.apache.org/datafusion/
+# https://datafusion.apache.org/
 publish:
   whoami: asf-site
-  subdir: datafusion

diff --git a/docs/README.md b/docs/README.md
index 32fe98466ce7..acf3cb754c00 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -21,7 +21,7 @@
 This folder contains the source content of the [User Guide](./source/user-guide)
 and [Contributor Guide](./source/contributor-guide). These are both published to
-https://arrow.apache.org/datafusion/ as part of the release process.
+https://datafusion.apache.org/ as part of the release process.

 ## Dependencies

@@ -55,7 +55,7 @@ automatically updated.

 ## Release Process

-This documentation is hosted at https://arrow.apache.org/datafusion/
+This documentation is hosted at https://datafusion.apache.org/

 When the PR is merged to the `main` branch of the DataFusion
 repository, a [github workflow](https://github.com/apache/datafusion/blob/main/.github/workflows/docs.yaml) which:

@@ -63,7 +63,7 @@
 1. Builds the html content
 2. Pushes the html content to the [`asf-site`](https://github.com/apache/datafusion/tree/asf-site) branch in this repository.

-The Apache Software Foundation provides https://arrow.apache.org/,
+The Apache Software Foundation provides https://datafusion.apache.org/,
 which serves content based on the configuration in
 [.asf.yaml](https://github.com/apache/datafusion/blob/main/.asf.yaml),
-which specifies the target as https://arrow.apache.org/datafusion/.
+which specifies the target as https://datafusion.apache.org/.

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 41cd7d9aee2a..5944d346ca95 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -50,6 +50,17 @@ Please see the `developer’s guide`_ for contributing and `communication`_ for

 .. _developer’s guide: contributor-guide/index.html#developer-s-guide
 .. _communication: contributor-guide/communication.html

+.. _toc.asf-links:
+.. toctree::
+   :maxdepth: 1
+   :caption: ASF Links
+
+   Apache Software Foundation <https://apache.org>
+   License <https://www.apache.org/licenses/>
+   Donate <https://www.apache.org/foundation/sponsorship.html>
+   Thanks <https://www.apache.org/foundation/thanks.html>
+   Security <https://www.apache.org/security/>
+
 .. _toc.links:
 .. toctree::
    :maxdepth: 1

From 089a42ac98d3a57dca3c50c791e34d88bdb7af7d Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 23 Apr 2024 09:15:39 -0400
Subject: [PATCH 2/6] Minor: Add `Column::from(Tableref, &FieldRef)`,
 `Expr::from(Column)` and `Expr::from(Tableref, &FieldRef)` (#10178)

* Minor: Add `Column::from(Tableref, &FieldRef)`

* Add Expr::from()

* fix docs

* Fix doc test
---
 benchmarks/src/tpch/convert.rs                |  4 +-
 datafusion/common/src/column.rs               | 11 ++++-
 datafusion/core/src/dataframe/mod.rs          |  6 +--
 datafusion/core/src/physical_planner.rs       | 11 +----
 datafusion/expr/src/expr.rs                   | 42 ++++++++++++++++++-
 datafusion/expr/src/expr_rewriter/mod.rs      |  8 +---
 datafusion/expr/src/logical_plan/builder.rs   |  2 +-
 datafusion/expr/src/utils.rs                  |  9 +---
 .../optimizer/src/common_subexpr_eliminate.rs |  7 +---
 .../src/replace_distinct_aggregate.rs         |  2 +-
 datafusion/sql/src/expr/mod.rs                |  8 ++--
 datafusion/sql/src/statement.rs               |  3 +-
 12 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs
index a841fe532294..30178d17aa54 100644
--- a/benchmarks/src/tpch/convert.rs
+++ b/benchmarks/src/tpch/convert.rs
@@ -88,9 +88,7 @@ impl ConvertOpt {
             .schema()
             .iter()
             .take(schema.fields.len() - 1)
-            .map(|(qualifier, field)| {
-                Expr::Column(Column::from((qualifier, field.as_ref())))
-            })
+            .map(Expr::from)
             .collect();

         csv = csv.select(selection)?;

diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index dec87d9d071e..ae3146516349 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -17,7 +17,7 @@

 //! Column

-use arrow_schema::Field;
+use arrow_schema::{Field, FieldRef};

 use crate::error::_schema_err;
 use crate::utils::{parse_identifiers_normalized, quote_identifier};
@@ -63,6 +63,8 @@ impl Column {
     }

     /// Create Column from unqualified name.
+    ///
+    /// Alias for `Column::new_unqualified`
     pub fn from_name(name: impl Into<String>) -> Self {
         Self {
             relation: None,
@@ -346,6 +348,13 @@ impl From<(Option<&TableReference>, &Field)> for Column {
     }
 }

+/// Create a column, use qualifier and field name
+impl From<(Option<&TableReference>, &FieldRef)> for Column {
+    fn from((relation, field): (Option<&TableReference>, &FieldRef)) -> Self {
+        Self::new(relation.cloned(), field.name())
+    }
+}
+
 impl FromStr for Column {
     type Err = Infallible;

diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index abf09772e5bb..bd561e89832e 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -1332,7 +1332,7 @@ impl DataFrame {
                     col_exists = true;
                     new_column.clone()
                 } else {
-                    col(Column::from((qualifier, field.as_ref())))
+                    col(Column::from((qualifier, field)))
                 }
             })
             .collect();
@@ -1402,9 +1402,9 @@ impl DataFrame {
             .iter()
             .map(|(qualifier, field)| {
                 if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename {
-                    col(Column::from((qualifier, field.as_ref()))).alias(new_name)
+                    col(Column::from((qualifier, field))).alias(new_name)
                 } else {
-                    col(Column::from((qualifier, field.as_ref())))
+                    col(Column::from((qualifier, field)))
                 }
             })
             .collect::<Vec<_>>();

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index e6785b1dec2a..848f561ffb85 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -1261,15 +1261,8 @@ impl DefaultPhysicalPlanner {

                 // Remove temporary projected columns
                 if left_projected || right_projected {
-                    let final_join_result = join_schema
-                        .iter()
-                        .map(|(qualifier, field)| {
-                            Expr::Column(datafusion_common::Column::from((
-                                qualifier,
-                                field.as_ref(),
-                            )))
-                        })
-                        .collect::<Vec<_>>();
+                    let final_join_result =
+                        join_schema.iter().map(Expr::from).collect::<Vec<_>>();
                     let projection = LogicalPlan::Projection(Projection::try_new(
                         final_join_result,
                         Arc::new(new_join),

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index fb75a3cc7a43..6f76936806c6 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -32,7 +32,7 @@ use crate::{
     Signature,
 };

-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, FieldRef};
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{
     internal_err, plan_err, Column, DFSchema, Result, ScalarValue, TableReference,
@@ -84,6 +84,29 @@ use sqlparser::ast::NullTreatment;
 /// assert_eq!(binary_expr.op, Operator::Eq);
 /// }
 /// ```
+///
+/// ## Return a list of [`Expr::Column`] from a schema's columns
+/// ```
+/// # use arrow::datatypes::{DataType, Field, Schema};
+/// # use datafusion_common::{DFSchema, Column};
+/// # use datafusion_expr::Expr;
+///
+/// let arrow_schema = Schema::new(vec![
+///    Field::new("c1", DataType::Int32, false),
+///    Field::new("c2", DataType::Float64, false),
+/// ]);
+/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap();
+///
+/// // Form a list of expressions for each item in the schema
+/// let exprs: Vec<_> = df_schema.iter()
+///   .map(Expr::from)
+///   .collect();
+///
+/// assert_eq!(exprs, vec![
+///   Expr::from(Column::from_qualified_name("t1.c1")),
+///   Expr::from(Column::from_qualified_name("t1.c2")),
+/// ]);
+/// ```
 #[derive(Clone, PartialEq, Eq, Hash, Debug)]
 pub enum Expr {
     /// An expression with a specific name.
@@ -190,6 +213,23 @@ impl Default for Expr {
     }
 }

+/// Create an [`Expr`] from a [`Column`]
+impl From<Column> for Expr {
+    fn from(value: Column) -> Self {
+        Expr::Column(value)
+    }
+}
+
+/// Create an [`Expr`] from an optional qualifier and a [`FieldRef`]. This is
+/// useful for creating [`Expr`] from a [`DFSchema`].
+///
+/// See example on [`Expr`]
+impl<'a> From<(Option<&'a TableReference>, &'a FieldRef)> for Expr {
+    fn from(value: (Option<&'a TableReference>, &'a FieldRef)) -> Self {
+        Expr::from(Column::from(value))
+    }
+}
+
 #[derive(Clone, PartialEq, Eq, Hash, Debug)]
 pub struct Unnest {
     pub expr: Box<Expr>,
 }

diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs
index f5779df812f1..fd6446eba971 100644
--- a/datafusion/expr/src/expr_rewriter/mod.rs
+++ b/datafusion/expr/src/expr_rewriter/mod.rs
@@ -218,13 +218,7 @@ pub fn coerce_plan_expr_for_schema(
             Ok(LogicalPlan::Projection(projection))
         }
         _ => {
-            let exprs: Vec<Expr> = plan
-                .schema()
-                .iter()
-                .map(|(qualifier, field)| {
-                    Expr::Column(Column::from((qualifier, field.as_ref())))
-                })
-                .collect();
+            let exprs: Vec<Expr> = plan.schema().iter().map(Expr::from).collect();

             let new_exprs = coerce_exprs_for_schema(exprs, plan.schema(), schema)?;
             let add_project = new_exprs.iter().any(|expr| expr.try_into_col().is_err());

diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index 2810425ae1d8..fa4b0b964295 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -1577,7 +1577,7 @@ pub fn unnest_with_options(
                 return Ok(input);
             }
         };
-        qualified_columns.push(Column::from((unnest_qualifier, unnested_field.as_ref())));
+        qualified_columns.push(Column::from((unnest_qualifier, &unnested_field)));
         unnested_fields.insert(index, unnested_field);
     }

diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 8da93c244c07..64fe98c23b08 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -356,12 +356,7 @@ fn get_exprs_except_skipped(
     columns_to_skip: HashSet<Column>,
 ) -> Vec<Expr> {
     if columns_to_skip.is_empty() {
-        schema
-            .iter()
-            .map(|(qualifier, field)| {
-                Expr::Column(Column::from((qualifier, field.as_ref())))
-            })
-            .collect::<Vec<_>>()
+        schema.iter().map(Expr::from).collect::<Vec<_>>()
     } else {
         schema
             .columns()
@@ -855,7 +850,7 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result<Expr> {
     match expr {
         Expr::Column(col) => {
             let (qualifier, field) = plan.schema().qualified_field_from_column(col)?;
-            Ok(Expr::Column(Column::from((qualifier, field))))
+            Ok(Expr::from(Column::from((qualifier, field))))
         }
         _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))),
     }

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 690b596ed35f..b859dda9d53f 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -506,7 +506,7 @@ fn build_common_expr_project_plan(

     for (qualifier, field) in input.schema().iter() {
         if fields_set.insert(qualified_name(qualifier, field.name())) {
-            project_exprs.push(Expr::Column(Column::from((qualifier, field.as_ref()))));
+            project_exprs.push(Expr::from((qualifier, field)));
         }
     }

@@ -525,10 +525,7 @@ fn build_recover_project_plan(
     schema: &DFSchema,
     input: LogicalPlan,
 ) -> Result<LogicalPlan> {
-    let col_exprs = schema
-        .iter()
-        .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field.as_ref()))))
-        .collect();
+    let col_exprs = schema.iter().map(Expr::from).collect();
     Ok(LogicalPlan::Projection(Projection::try_new(
         col_exprs,
         Arc::new(input),

diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs
index f464506057ff..4f68e2623f40 100644
--- a/datafusion/optimizer/src/replace_distinct_aggregate.rs
+++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs
@@ -127,7 +127,7 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
                 .skip(on_expr.len())
                 .zip(schema.iter())
                 .map(|((new_qualifier, new_field), (old_qualifier, old_field))| {
-                    Ok(col(Column::from((new_qualifier, new_field.as_ref())))
+                    Ok(col(Column::from((new_qualifier, new_field)))
                         .alias_qualified(old_qualifier.cloned(), old_field.name()))
                 })
                 .collect::<Result<Vec<_>>>()?;

diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 4bc0719fbaa5..3f2134bf7e9b 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -21,8 +21,8 @@ use sqlparser::ast::{ArrayAgg, Expr as SQLExpr, JsonOperator, TrimWhereField, Value};
 use sqlparser::parser::ParserError::ParserError;

 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, DFSchema,
-    Result, ScalarValue,
+    internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result,
+    ScalarValue,
 };
 use datafusion_expr::expr::AggregateFunctionDefinition;
 use datafusion_expr::expr::InList;
@@ -142,9 +142,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                         }
                         _ => false,
                     }) {
-                        Some((qualifier, df_field)) => {
-                            Expr::Column(Column::from((qualifier, df_field.as_ref())))
-                        }
+                        Some((qualifier, df_field)) => Expr::from((qualifier, df_field)),
                         None => Expr::Column(col),
                     }
                 }

diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs
index 759a5e8ce9d3..c81217aa7017 100644
--- a/datafusion/sql/src/statement.rs
+++ b/datafusion/sql/src/statement.rs
@@ -1307,8 +1307,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                         ))
                     } else {
                         datafusion_expr::Expr::Column(Column::from((
-                            qualifier,
-                            field.as_ref(),
+                            qualifier, field,
                         )))
                     }
                 }
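The conversions added in this patch let a `DFSchema`'s `(qualifier, field)` pairs map directly into column expressions. A minimal sketch of the resulting call pattern, mirroring the doc test added above (the `t1` qualifier and column names are illustrative only):

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::DFSchema;
use datafusion_expr::Expr;

fn main() {
    // Hypothetical two-column schema, qualified with table name "t1"
    let arrow_schema = Schema::new(vec![
        Field::new("c1", DataType::Int32, false),
        Field::new("c2", DataType::Float64, false),
    ]);
    let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap();

    // Previously this required a closure building Column and Expr by hand;
    // with Expr::from((Option<&TableReference>, &FieldRef)) it is one map call
    let exprs: Vec<Expr> = df_schema.iter().map(Expr::from).collect();
    assert_eq!(exprs.len(), 2);
}
```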
From c96182bf1aba0d0bf718be6b5b5dec527df6cced Mon Sep 17 00:00:00 2001
From: Lordworms <48054792+Lordworms@users.noreply.github.com>
Date: Tue, 23 Apr 2024 12:35:22 -0500
Subject: [PATCH 3/6] implement rewrite for FilterNullJoinKeys (#10166)

---
 .../optimizer/src/filter_null_join_keys.rs | 61 ++++++++++---------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/datafusion/optimizer/src/filter_null_join_keys.rs b/datafusion/optimizer/src/filter_null_join_keys.rs
index fcf85327fdb0..a4e345ceb3c1 100644
--- a/datafusion/optimizer/src/filter_null_join_keys.rs
+++ b/datafusion/optimizer/src/filter_null_join_keys.rs
@@ -19,9 +19,11 @@

 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::Result;
+use datafusion_common::tree_node::Transformed;
+use datafusion_common::{internal_err, Result};
+use datafusion_expr::utils::conjunction;
 use datafusion_expr::{
-    and, logical_plan::Filter, logical_plan::JoinType, Expr, ExprSchemable, LogicalPlan,
+    logical_plan::Filter, logical_plan::JoinType, Expr, ExprSchemable, LogicalPlan,
 };
 use std::sync::Arc;

@@ -32,24 +34,34 @@
 #[derive(Default)]
 pub struct FilterNullJoinKeys {}

-impl FilterNullJoinKeys {
-    pub const NAME: &'static str = "filter_null_join_keys";
-}
-
 impl OptimizerRule for FilterNullJoinKeys {
     fn try_optimize(
         &self,
-        plan: &LogicalPlan,
-        config: &dyn OptimizerConfig,
+        _plan: &LogicalPlan,
+        _config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
+        internal_err!("Should have called FilterNullJoinKeys::rewrite")
+    }
+
+    fn supports_rewrite(&self) -> bool {
+        true
+    }
+
+    fn apply_order(&self) -> Option<ApplyOrder> {
+        Some(ApplyOrder::BottomUp)
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
         if !config.options().optimizer.filter_null_join_keys {
-            return Ok(None);
+            return Ok(Transformed::no(plan));
         }
         match plan {
-            LogicalPlan::Join(join) if join.join_type == JoinType::Inner => {
-                let mut join = join.clone();
-
+            LogicalPlan::Join(mut join) if join.join_type == JoinType::Inner => {
                 let left_schema = join.left.schema();
                 let right_schema = join.right.schema();

@@ -69,29 +81,22 @@
                 if !left_filters.is_empty() {
                     let predicate = create_not_null_predicate(left_filters);
                     join.left = Arc::new(LogicalPlan::Filter(Filter::try_new(
-                        predicate,
-                        join.left.clone(),
+                        predicate, join.left,
                     )?));
                 }
                 if !right_filters.is_empty() {
                     let predicate = create_not_null_predicate(right_filters);
                     join.right = Arc::new(LogicalPlan::Filter(Filter::try_new(
-                        predicate,
-                        join.right.clone(),
+                        predicate, join.right,
                     )?));
                 }
-                Ok(Some(LogicalPlan::Join(join)))
+                Ok(Transformed::yes(LogicalPlan::Join(join)))
             }
-            _ => Ok(None),
+            _ => Ok(Transformed::no(plan)),
         }
     }

-    fn name(&self) -> &str {
-        Self::NAME
-    }
-
-    fn apply_order(&self) -> Option<ApplyOrder> {
-        Some(ApplyOrder::BottomUp)
+    fn name(&self) -> &str {
+        "filter_null_join_keys"
     }
 }

@@ -100,11 +105,9 @@
 fn create_not_null_predicate(filters: Vec<Expr>) -> Expr {
     let not_null_exprs: Vec<Expr> = filters
         .into_iter()
         .map(|c| Expr::IsNotNull(Box::new(c)))
         .collect();
-    // combine the IsNotNull expressions with AND
-    not_null_exprs
-        .iter()
-        .skip(1)
-        .fold(not_null_exprs[0].clone(), |a, b| and(a, b.clone()))
+
+    // directly unwrap since it should always have a value
+    conjunction(not_null_exprs).unwrap()
 }

 #[cfg(test)]
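Replacing the manual `skip(1)`/`fold` over `and` with `conjunction` leans on an existing helper in `datafusion_expr::utils` that ANDs a list of predicates together and returns `None` for an empty list, which is why the `unwrap()` is safe only because the rule calls it with at least one filter. A small sketch of that contract (the column names `a` and `b` are made up):

```rust
use datafusion_expr::{col, utils::conjunction, Expr};

fn main() {
    // conjunction folds [p1, p2, ...] into p1 AND p2 AND ...
    let pred = conjunction(vec![
        col("a").is_not_null(),
        col("b").is_not_null(),
    ]);
    assert!(pred.is_some());

    // An empty list yields None, which is why create_not_null_predicate
    // must only be called when there is at least one join-key filter
    assert!(conjunction(Vec::<Expr>::new()).is_none());
}
```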
From 08030f37fa34995af2ea43143922476ace82e39d Mon Sep 17 00:00:00 2001
From: Lordworms <48054792+Lordworms@users.noreply.github.com>
Date: Tue, 23 Apr 2024 12:38:37 -0500
Subject: [PATCH 4/6] Implement rewrite for EliminateOneUnion and EliminateJoin
 (#10184)

* implement rewrite for EliminateJoin

* implement rewrite for eliminate_on_union
---
 datafusion/expr/src/logical_plan/tree_node.rs |  2 +-
 datafusion/optimizer/src/eliminate_join.rs    | 51 ++++++++++++-------
 .../optimizer/src/eliminate_one_union.rs      | 30 +++++++----
 3 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs
index f5db5a2704bc..37a36c36ca53 100644
--- a/datafusion/expr/src/logical_plan/tree_node.rs
+++ b/datafusion/expr/src/logical_plan/tree_node.rs
@@ -363,7 +363,7 @@ impl TreeNode for LogicalPlan {

 /// Converts a `Arc<LogicalPlan>` without copying, if possible. Copies the plan
 /// if there is a shared reference
-fn unwrap_arc(plan: Arc<LogicalPlan>) -> LogicalPlan {
+pub fn unwrap_arc(plan: Arc<LogicalPlan>) -> LogicalPlan {
     Arc::try_unwrap(plan)
         // if None is returned, there is another reference to this
         // LogicalPlan, so we can not own it, and must clone instead

diff --git a/datafusion/optimizer/src/eliminate_join.rs b/datafusion/optimizer/src/eliminate_join.rs
index caf45dda9896..fea87e758790 100644
--- a/datafusion/optimizer/src/eliminate_join.rs
+++ b/datafusion/optimizer/src/eliminate_join.rs
@@ -18,7 +18,8 @@
 //! [`EliminateJoin`] rewrites `INNER JOIN` with `true`/`null`
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::{Result, ScalarValue};
+use datafusion_common::tree_node::Transformed;
+use datafusion_common::{internal_err, Result, ScalarValue};
 use datafusion_expr::JoinType::Inner;
 use datafusion_expr::{
     logical_plan::{EmptyRelation, LogicalPlan},
@@ -39,38 +40,50 @@ impl EliminateJoin {
 impl OptimizerRule for EliminateJoin {
     fn try_optimize(
         &self,
-        plan: &LogicalPlan,
+        _plan: &LogicalPlan,
         _config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
+        internal_err!("Should have called EliminateJoin::rewrite")
+    }
+
+    fn name(&self) -> &str {
+        "eliminate_join"
+    }
+
+    fn apply_order(&self) -> Option<ApplyOrder> {
+        Some(ApplyOrder::TopDown)
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
         match plan {
             LogicalPlan::Join(join) if join.join_type == Inner && join.on.is_empty() => {
                 match join.filter {
                     Some(Expr::Literal(ScalarValue::Boolean(Some(true)))) => {
-                        Ok(Some(LogicalPlan::CrossJoin(CrossJoin {
-                            left: join.left.clone(),
-                            right: join.right.clone(),
-                            schema: join.schema.clone(),
+                        Ok(Transformed::yes(LogicalPlan::CrossJoin(CrossJoin {
+                            left: join.left,
+                            right: join.right,
+                            schema: join.schema,
                         })))
                     }
-                    Some(Expr::Literal(ScalarValue::Boolean(Some(false)))) => {
-                        Ok(Some(LogicalPlan::EmptyRelation(EmptyRelation {
+                    Some(Expr::Literal(ScalarValue::Boolean(Some(false)))) => Ok(
+                        Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation {
                             produce_one_row: false,
-                            schema: join.schema.clone(),
-                        })))
-                    }
-                    _ => Ok(None),
+                            schema: join.schema,
+                        })),
+                    ),
+                    _ => Ok(Transformed::no(LogicalPlan::Join(join))),
                 }
             }
-            _ => Ok(None),
+            _ => Ok(Transformed::no(plan)),
         }
     }

-    fn name(&self) -> &str {
-        "eliminate_join"
-    }
-
-    fn apply_order(&self) -> Option<ApplyOrder> {
-        Some(ApplyOrder::TopDown)
+    fn supports_rewrite(&self) -> bool {
+        true
     }
 }

diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs
index 95a3370ab1b5..11a9009cd96c 100644
--- a/datafusion/optimizer/src/eliminate_one_union.rs
+++ b/datafusion/optimizer/src/eliminate_one_union.rs
@@ -17,8 +17,8 @@

 //! [`EliminateOneUnion`] eliminates single element `Union`
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::Result;
-use datafusion_expr::logical_plan::{LogicalPlan, Union};
+use datafusion_common::{internal_err, tree_node::Transformed, Result};
+use datafusion_expr::logical_plan::{tree_node::unwrap_arc, LogicalPlan, Union};

 use crate::optimizer::ApplyOrder;

@@ -36,21 +36,33 @@ impl EliminateOneUnion {
 impl OptimizerRule for EliminateOneUnion {
     fn try_optimize(
         &self,
-        plan: &LogicalPlan,
+        _plan: &LogicalPlan,
         _config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
-        match plan {
-            LogicalPlan::Union(Union { inputs, .. }) if inputs.len() == 1 => {
-                Ok(inputs.first().map(|input| input.as_ref().clone()))
-            }
-            _ => Ok(None),
-        }
+        internal_err!("Should have called EliminateOneUnion::rewrite")
     }

     fn name(&self) -> &str {
         "eliminate_one_union"
     }

+    fn supports_rewrite(&self) -> bool {
+        true
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
+        match plan {
+            LogicalPlan::Union(Union { mut inputs, .. }) if inputs.len() == 1 => {
+                Ok(Transformed::yes(unwrap_arc(inputs.pop().unwrap())))
+            }
+            _ => Ok(Transformed::no(plan)),
+        }
+    }
+
     fn apply_order(&self) -> Option<ApplyOrder> {
         Some(ApplyOrder::TopDown)
     }
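Making `unwrap_arc` public is what lets `EliminateOneUnion` hand back the union's only child without a deep clone whenever the `Arc` is uniquely held. The same idea in plain `std`, as a standalone sketch rather than DataFusion's exact code:

```rust
use std::sync::Arc;

// Mirrors the idea behind unwrap_arc: take ownership when the Arc is
// uniquely held, and fall back to a clone only when it is shared.
fn unwrap_or_clone<T: Clone>(value: Arc<T>) -> T {
    Arc::try_unwrap(value).unwrap_or_else(|shared| shared.as_ref().clone())
}

fn main() {
    let unique = Arc::new(vec![1, 2, 3]);
    let owned = unwrap_or_clone(unique); // refcount was 1: no clone happens

    let shared = Arc::new(vec![4, 5, 6]);
    let _other = Arc::clone(&shared);
    let cloned = unwrap_or_clone(shared); // still shared: falls back to clone
    assert_eq!(owned.len() + cloned.len(), 6);
}
```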
From 41b3f5c6b05734685c8a2bf2ba153f66f0b654f8 Mon Sep 17 00:00:00 2001
From: Phillip LeBlanc
Date: Wed, 24 Apr 2024 02:52:12 +0900
Subject: [PATCH 5/6] Update links to datafusion.apache.org (#10195)

---
 CONTRIBUTING.md                             |  2 +-
 README.md                                   | 14 +++++++-------
 datafusion-cli/README.md                    |  4 ++--
 datafusion/core/README.md                   |  2 +-
 datafusion/core/src/catalog/mod.rs          |  2 +-
 datafusion/core/src/lib.rs                  |  6 +++---
 datafusion/expr/src/aggregate_function.rs   |  2 +-
 datafusion/proto/CONTRIBUTING.md            |  2 +-
 datafusion/proto/README.md                  |  2 +-
 dev/changelog/15.0.0.md                     |  2 +-
 dev/release/README.md                       |  2 +-
 docs/source/contributor-guide/index.md      |  4 ++--
 docs/source/user-guide/introduction.md      |  4 ++--
 docs/source/user-guide/sql/select.md        |  2 +-
 docs/source/user-guide/sql/write_options.md |  2 +-
 15 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6d5c5d1460da..896a55b9238c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,4 +17,4 @@
 under the License.
 -->

-See the [Contributor Guide](https://arrow.apache.org/datafusion/contributor-guide/index.html)
+See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html)

diff --git a/README.md b/README.md
index 60693402ffa3..8c2392850953 100644
--- a/README.md
+++ b/README.md
@@ -46,10 +46,10 @@ in-memory format. [Python Bindings](https://github.com/apache/datafusion-python)

 Here are links to some important information

-- [Project Site](https://arrow.apache.org/datafusion)
-- [Installation](https://arrow.apache.org/datafusion/user-guide/cli/installation.html)
-- [Rust Getting Started](https://arrow.apache.org/datafusion/user-guide/example-usage.html)
-- [Rust DataFrame API](https://arrow.apache.org/datafusion/user-guide/dataframe.html)
+- [Project Site](https://datafusion.apache.org/)
+- [Installation](https://datafusion.apache.org/user-guide/cli/installation.html)
+- [Rust Getting Started](https://datafusion.apache.org/user-guide/example-usage.html)
+- [Rust DataFrame API](https://datafusion.apache.org/user-guide/dataframe.html)
 - [Rust API docs](https://docs.rs/datafusion/latest/datafusion)
 - [Rust Examples](https://github.com/apache/datafusion/tree/master/datafusion-examples)
 - [Python DataFrame API](https://arrow.apache.org/datafusion-python/)
@@ -58,14 +58,14 @@

 ## What can you do with this crate?

 DataFusion is great for building projects such as domain specific query engines, new database platforms and data pipelines, query languages and more.
-It lets you start quickly from a fully working engine, and then customize those features specific to your use. [Click Here](https://arrow.apache.org/datafusion/user-guide/introduction.html#known-users) to see a list known users.
+It lets you start quickly from a fully working engine, and then customize those features specific to your use. [Click Here](https://datafusion.apache.org/user-guide/introduction.html#known-users) to see a list known users.

 ## Contributing to DataFusion

 Please see the [contributor guide] and [communication] pages for more information.

-[contributor guide]: https://arrow.apache.org/datafusion/contributor-guide
-[communication]: https://arrow.apache.org/datafusion/contributor-guide/communication.html
+[contributor guide]: https://datafusion.apache.org/contributor-guide
+[communication]: https://datafusion.apache.org/contributor-guide/communication.html

 ## Crate features

diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md
index a3fea22ddd0a..73a2eb01b76f 100644
--- a/datafusion-cli/README.md
+++ b/datafusion-cli/README.md
@@ -21,7 +21,7 @@

 # DataFusion Command-line Interface

-[DataFusion](https://arrow.apache.org/datafusion/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.

 DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine.

@@ -29,7 +29,7 @@

 ## Where can I find more information?

-See the [`datafusion-cli` documentation](https://arrow.apache.org/datafusion/user-guide/cli.html) for further information.
+See the [`datafusion-cli` documentation](https://datafusion.apache.org/user-guide/cli/index.html) for further information.

 ## How do I make my IDE work with `datafusion-cli`?

diff --git a/datafusion/core/README.md b/datafusion/core/README.md
index 16bc0672a277..b5501087d264 100644
--- a/datafusion/core/README.md
+++ b/datafusion/core/README.md
@@ -27,5 +27,5 @@ This crate contains the main entry points and high level DataFusion APIs such as

 For more information, please see:

-- [DataFusion Website](https://arrow.apache.org/datafusion)
+- [DataFusion Website](https://datafusion.apache.org)
 - [DataFusion API Docs](https://docs.rs/datafusion/latest/datafusion/)

diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs
index 4a9c5170c2f5..209d9b2af297 100644
--- a/datafusion/core/src/catalog/mod.rs
+++ b/datafusion/core/src/catalog/mod.rs
@@ -175,7 +175,7 @@ impl CatalogProviderList for MemoryCatalogProviderList {
 /// * [delta-rs]: [`UnityCatalogProvider`] implementation that can
 ///   read from Delta Lake tables
 ///
-/// [`datafusion-cli`]: https://arrow.apache.org/datafusion/user-guide/cli.html
+/// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html
 /// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
 /// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs
 /// [delta-rs]: https://github.com/delta-io/delta-rs

diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index 21ca6d70eb58..24be185fb079 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -30,10 +30,10 @@
 //! additional data sources, query languages, functions, custom
 //! operators and more. See the [Architecture] section for more details.
 //!
-//! [DataFusion]: https://arrow.apache.org/datafusion/
+//! [DataFusion]: https://datafusion.apache.org/
 //! [Apache Arrow]: https://arrow.apache.org
-//! [use cases]: https://arrow.apache.org/datafusion/user-guide/introduction.html#use-cases
-//! [SQL]: https://arrow.apache.org/datafusion/user-guide/sql/index.html
+//! [use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
+//! [SQL]: https://datafusion.apache.org/user-guide/sql/index.html
 //! [`DataFrame`]: dataframe::DataFrame
 //! [Architecture]: #architecture
 //!

diff --git a/datafusion/expr/src/aggregate_function.rs b/datafusion/expr/src/aggregate_function.rs
index 890a2ed04965..3dc9c3a01c15 100644
--- a/datafusion/expr/src/aggregate_function.rs
+++ b/datafusion/expr/src/aggregate_function.rs
@@ -30,7 +30,7 @@ use strum_macros::EnumIter;

 /// Enum of all built-in aggregate functions
 // Contributor's guide for adding new aggregate functions
-// https://arrow.apache.org/datafusion/contributor-guide/index.html#how-to-add-a-new-aggregate-function
+// https://datafusion.apache.org/contributor-guide/index.html#how-to-add-a-new-aggregate-function
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, EnumIter)]
 pub enum AggregateFunction {
     /// Count

diff --git a/datafusion/proto/CONTRIBUTING.md b/datafusion/proto/CONTRIBUTING.md
index b793b47e76a6..f124c233d04f 100644
--- a/datafusion/proto/CONTRIBUTING.md
+++ b/datafusion/proto/CONTRIBUTING.md
@@ -29,4 +29,4 @@ valid installation of [protoc] (see [installation instructions] for details).
 ```

 [protoc]: https://github.com/protocolbuffers/protobuf#protocol-compiler-installation
-[installation instructions]: https://arrow.apache.org/datafusion/contributor-guide/#protoc-installation
+[installation instructions]: https://datafusion.apache.org/contributor-guide/#protoc-installation

diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md
index 23485cf32876..f51e4664d5d9 100644
--- a/datafusion/proto/README.md
+++ b/datafusion/proto/README.md
@@ -25,5 +25,5 @@ when building a distributed query engine.

 See [API Docs] for details and examples.

-[datafusion]: https://arrow.apache.org/datafusion
+[datafusion]: https://datafusion.apache.org
 [api docs]: http://docs.rs/datafusion-proto/latest

diff --git a/dev/changelog/15.0.0.md b/dev/changelog/15.0.0.md
index 71b9d40b04b5..63de08bef216 100644
--- a/dev/changelog/15.0.0.md
+++ b/dev/changelog/15.0.0.md
@@ -89,7 +89,7 @@
 - Consider to categorize Operator [\#3216](https://github.com/apache/datafusion/issues/3216)
 - Replace Projection.alias with SubqueryAlias [\#2212](https://github.com/apache/datafusion/issues/2212)
 - \[Optimizer\] Eliminate the distinct [\#2045](https://github.com/apache/datafusion/issues/2045)
-- beautify datafusion's site: https://arrow.apache.org/datafusion/ [\#1819](https://github.com/apache/datafusion/issues/1819)
+- beautify datafusion's site: https://datafusion.apache.org/ [\#1819](https://github.com/apache/datafusion/issues/1819)
 - split datafusion-logical-plan sub-module [\#1755](https://github.com/apache/datafusion/issues/1755)
 - convert `outer join` to `inner join` to improve performance [\#1585](https://github.com/apache/datafusion/issues/1585)
 - Add sqllogictest for datafusion [\#1453](https://github.com/apache/datafusion/issues/1453)

diff --git a/dev/release/README.md b/dev/release/README.md
index 4d0ff0e3aea9..c1701062cadf 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -463,7 +463,7 @@ svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist

 - Checkout the `asf-site` branch
 - Copy content from `docs/build/html/*` to the `datafusion` directory in arrow-site
 - Create a PR against the `asf-site` branch ([example](https://github.com/apache/arrow-site/pull/237))
-- Once the PR is merged, the content will be published to https://arrow.apache.org/datafusion/ by GitHub Pages (this
+- Once the PR is merged, the content will be published to https://datafusion.apache.org/ by GitHub Pages (this
   can take some time).

 ### Optional: Write a blog post announcing the release

diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
index 110fddd995f5..252848bb7132 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -348,8 +348,8 @@ new specifications as you see fit.

 Here is the list current active specifications:

-- [Output field name semantic](https://arrow.apache.org/datafusion/contributor-guide/specification/output-field-name-semantic.html)
-- [Invariants](https://arrow.apache.org/datafusion/contributor-guide/specification/invariants.html)
+- [Output field name semantic](https://datafusion.apache.org/contributor-guide/specification/output-field-name-semantic.html)
+- [Invariants](https://datafusion.apache.org/contributor-guide/specification/invariants.html)

 All specifications are stored in the `docs/source/specification` folder.

diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md
index c81cb00a26ea..a3fefdc56a7e 100644
--- a/docs/source/user-guide/introduction.md
+++ b/docs/source/user-guide/introduction.md
@@ -39,7 +39,7 @@ Arrow](https://arrow.apache.org/).

 ## Features

-- Feature-rich [SQL support](https://arrow.apache.org/datafusion/user-guide/sql/index.html) and [DataFrame API](https://arrow.apache.org/datafusion/user-guide/dataframe.html)
+- Feature-rich [SQL support](https://datafusion.apache.org/user-guide/sql/index.html) and [DataFrame API](https://datafusion.apache.org/user-guide/dataframe.html)
 - Blazingly fast, vectorized, multi-threaded, streaming execution engine.
 - Native support for Parquet, CSV, JSON, and Avro file formats. Support
   for custom file formats and non file datasources via the `TableProvider`
   trait.
@@ -49,7 +49,7 @@
   Azure Blob Storage, and Google Cloud Storage (Other storage systems
   are supported via the `ObjectStore` trait).
 - [Excellent Documentation](https://docs.rs/datafusion/latest) and a
-  [welcoming community](https://arrow.apache.org/datafusion/contributor-guide/communication.html).
+  [welcoming community](https://datafusion.apache.org/contributor-guide/communication.html).
 - A state of the art query optimizer with expression coercion and
   simplification, projection and filter pushdown, sort and distribution
   aware optimizations, automatic join reordering, and more.

diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md
index 4463b0bb0863..b2fa0a630588 100644
--- a/docs/source/user-guide/sql/select.md
+++ b/docs/source/user-guide/sql/select.md
@@ -22,7 +22,7 @@ The queries in DataFusion scan data from tables and return 0 or more rows.

 Please be aware that column names in queries are made lower-case, but not on the inferred schema. Accordingly, if you
 want to query against a capitalized field, make sure to use double quotes. Please see this
-[example](https://arrow.apache.org/datafusion/user-guide/example-usage.html) for clarification.
+[example](https://datafusion.apache.org/user-guide/example-usage.html) for clarification.
 In this documentation we describe the SQL syntax in DataFusion.
 DataFusion supports the following syntax for queries:

diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md
index 5c204d8fc0e6..7631525f13e5 100644
--- a/docs/source/user-guide/sql/write_options.md
+++ b/docs/source/user-guide/sql/write_options.md
@@ -95,7 +95,7 @@ The following options are available when writing CSV files. Note: if any unsupported

 ### Parquet Format Specific Options

-The following options are available when writing parquet files. If any unsupported option is specified an error will be raised and the query will fail. If a column specific option is specified for a column which does not exist, the option will be ignored without error. For default values, see: [Configuration Settings](https://arrow.apache.org/datafusion/user-guide/configs.html).
+The following options are available when writing parquet files. If any unsupported option is specified an error will be raised and the query will fail. If a column specific option is specified for a column which does not exist, the option will be ignored without error. For default values, see: [Configuration Settings](https://datafusion.apache.org/user-guide/configs.html).

 | Option                       | Can be Column Specific? | Description                                                                                                                           |
 | ---------------------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |

From 15045a8849e80327de68b1ec59e4a0644f05582c Mon Sep 17 00:00:00 2001
From: Peter Toth
Date: Tue, 23 Apr 2024 20:08:31 +0200
Subject: [PATCH 6/6] Introduce `Expr::is_volatile()`, adjust
 `TreeNode::exists()` (#10191)

---
 datafusion/common/src/tree_node.rs            |  7 +++----
 datafusion/expr/src/expr.rs                   | 20 +++++++++----------
 .../optimizer/src/common_subexpr_eliminate.rs |  5 ++---
 datafusion/optimizer/src/push_down_filter.rs  |  5 +----
 datafusion/optimizer/src/utils.rs             | 16 ---------------
 5 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs
index 7003f5ac7f38..43026f3a9206 100644
--- a/datafusion/common/src/tree_node.rs
+++ b/datafusion/common/src/tree_node.rs
@@ -405,18 +405,17 @@ pub trait TreeNode: Sized {
     /// Returns true if `f` returns true for any node in the tree.
     ///
     /// Stops recursion as soon as a matching node is found
-    fn exists<F: FnMut(&Self) -> bool>(&self, mut f: F) -> bool {
+    fn exists<F: FnMut(&Self) -> Result<bool>>(&self, mut f: F) -> Result<bool> {
         let mut found = false;
         self.apply(|n| {
-            Ok(if f(n) {
+            Ok(if f(n)? {
                 found = true;
                 TreeNodeRecursion::Stop
             } else {
                 TreeNodeRecursion::Continue
             })
         })
-        .unwrap();
-        found
+        .map(|_| found)
     }

     /// Low-level API used to implement other APIs.

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index 6f76936806c6..ea2cfeafe6e5 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -1271,7 +1271,16 @@ impl Expr {

     /// Return true when the expression contains out reference(correlated) expressions.
     pub fn contains_outer(&self) -> bool {
-        self.exists(|expr| matches!(expr, Expr::OuterReferenceColumn { .. }))
+        self.exists(|expr| Ok(matches!(expr, Expr::OuterReferenceColumn { .. })))
+            .unwrap()
+    }
+
+    /// Returns true if the expression is volatile, i.e. whether it can return different
+    /// results when evaluated multiple times with the same input.
+    pub fn is_volatile(&self) -> Result<bool> {
+        self.exists(|expr| {
+            Ok(matches!(expr, Expr::ScalarFunction(func) if func.func_def.is_volatile()?))
+        })
     }

     /// Recursively find all [`Expr::Placeholder`] expressions, and
@@ -1931,15 +1940,6 @@ fn create_names(exprs: &[Expr]) -> Result<String> {
             .join(", "))
 }

-/// Whether the given expression is volatile, i.e. whether it can return different results
-/// when evaluated multiple times with the same input.
-pub fn is_volatile(expr: &Expr) -> Result<bool> {
-    match expr {
-        Expr::ScalarFunction(func) => func.func_def.is_volatile(),
-        _ => Ok(false),
-    }
-}
-
 #[cfg(test)]
 mod test {
     use crate::expr::Cast;

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index b859dda9d53f..081d9c250573 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -21,7 +21,6 @@ use std::collections::hash_map::Entry;
 use std::collections::{BTreeSet, HashMap};
 use std::sync::Arc;

-use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};

 use arrow::datatypes::{DataType, Field};
@@ -661,7 +660,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
     fn f_down(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> {
         // related to https://github.com/apache/datafusion/issues/8814
         // If the expr contain volatile expression or is a short-circuit expression, skip it.
-        if expr.short_circuits() || is_volatile_expression(expr)? {
+        if expr.short_circuits() || expr.is_volatile()? {
             self.visit_stack
                 .push(VisitRecord::JumpMark(self.node_count));
             return Ok(TreeNodeRecursion::Jump); // go to f_up
@@ -717,7 +716,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> {
         // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate
         // the `id_array`, which records the expr's identifier used to rewrite expr. So if we
         // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too.
-        if expr.short_circuits() || is_volatile_expression(&expr)? {
+        if expr.short_circuits() || expr.is_volatile()? {
             return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump));
         }

diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index 950932f479c9..e1561ad9d6fe 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -18,7 +18,6 @@ use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

 use crate::optimizer::ApplyOrder;
-use crate::utils::is_volatile_expression;
 use crate::{OptimizerConfig, OptimizerRule};

 use datafusion_common::tree_node::{
@@ -705,9 +704,7 @@
                             (qualified_name(qualifier, field.name()), expr)
                         })
-                        .partition(|(_, value)| {
-                            is_volatile_expression(value).unwrap_or(true)
-                        });
+                        .partition(|(_, value)| value.is_volatile().unwrap_or(true));

                 let mut push_predicates = vec![];
                 let mut keep_predicates = vec![];

diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs
index aad89be7db79..1c20501da53a 100644
--- a/datafusion/optimizer/src/utils.rs
+++ b/datafusion/optimizer/src/utils.rs
@@ -21,9 +21,7 @@ use std::collections::{BTreeSet, HashMap};

 use crate::{OptimizerConfig, OptimizerRule};

-use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion_common::{Column, DFSchema, DFSchemaRef, Result};
-use datafusion_expr::expr::is_volatile;
 use datafusion_expr::expr_rewriter::replace_col;
 use datafusion_expr::utils as expr_utils;
 use datafusion_expr::{logical_plan::LogicalPlan, Expr, Operator};
@@ -97,20 +95,6 @@ pub fn log_plan(description: &str, plan: &LogicalPlan) {
     trace!("{description}::\n{}\n", plan.display_indent_schema());
 }

-/// check whether the expression is volatile predicates
-pub(crate) fn is_volatile_expression(e: &Expr) -> Result<bool> {
-    let mut is_volatile_expr = false;
-    e.apply(|expr| {
-        Ok(if is_volatile(expr)? {
-            is_volatile_expr = true;
-            TreeNodeRecursion::Stop
-        } else {
-            TreeNodeRecursion::Continue
-        })
-    })?;
-    Ok(is_volatile_expr)
-}
-
 /// Splits a conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]`
 ///
 /// See [`split_conjunction_owned`] for more details and an example.
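With the adjusted signature, the closure passed to `TreeNode::exists` returns `Result<bool>`, so fallible predicates such as the volatility check can propagate errors instead of panicking inside the traversal. A rough sketch of the new call pattern (the sample expression is illustrative only):

```rust
use datafusion_common::tree_node::TreeNode;
use datafusion_common::Result;
use datafusion_expr::{col, lit, Expr};

fn inspect(expr: &Expr) -> Result<()> {
    // exists() now takes a fallible closure and returns Result<bool>
    let has_literal = expr.exists(|e| Ok(matches!(e, Expr::Literal(_))))?;

    // Expr::is_volatile() is built on the same traversal
    let volatile = expr.is_volatile()?;
    println!("has_literal={has_literal}, volatile={volatile}");
    Ok(())
}

fn main() -> Result<()> {
    inspect(&(col("a") + lit(1)))
}
```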