From 928c415e844ea82a07b4a63f25c7b1bfe76bb776 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:24:32 -0600 Subject: [PATCH 01/26] Add some initial content about creating logical plans --- Cargo.toml | 1 + docs/Cargo.toml | 32 ++++++++ .../building-logical-plans.md | 82 ++++++++++++++++++- docs/src/lib.rs | 19 +++++ docs/src/library_logical_plan.rs | 50 +++++++++++ 5 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 docs/Cargo.toml create mode 100644 docs/src/lib.rs create mode 100644 docs/src/library_logical_plan.rs diff --git a/Cargo.toml b/Cargo.toml index 71088e7fc7ad..77e3c6038ea7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ members = [ "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", + "docs", "test-utils", "benchmarks", ] diff --git a/docs/Cargo.toml b/docs/Cargo.toml new file mode 100644 index 000000000000..dc92994f773c --- /dev/null +++ b/docs/Cargo.toml @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-docs" +description = "DataFusion Documentation" +publish = false +version = { workspace = true } +edition = { workspace = true } +readme = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +rust-version = "1.70" + +[dependencies] +datafusion = { path = "../datafusion/core" } \ No newline at end of file diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 406f4881129c..f7b45c98454d 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -19,4 +19,84 @@ # Building Logical Plans -Coming Soon +A logical plan is a structured representation of a database query that describes the high-level operations and +transformations needed to retrieve data from a database or data source. It abstracts away specific implementation +details and focuses on the logical flow of the query, including operations like filtering, sorting, and joining tables. + +This logical plan serves as an intermediate step before generating an optimized physical execution plan. + +DataFusion logical plans are typically created using the [LogicalPlanBuilder] struct. The following associated functions can be +used to create a new builder: + +- `empty` - create an empty plan with no fields +- `values` - create a plan from a set of literal values +- `scan` - create a plan representing a table scan +- `scan_with_filters` - create a plan representing a table scan with filters + +Once the builder is created, transformation methods can be called to declare that further operations should be +performed on the plan. 
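For instance, a rough sketch of chaining a few of these methods (this snippet is illustrative rather than one of the guide's tested examples, and assumes a `table_source` like the one constructed in the full example further down) might look like this:

```rust
// illustrative only: chain transformation methods on the builder;
// `table_source` is assumed to be built as in the full example below
let plan = LogicalPlanBuilder::scan("person", Arc::new(table_source), None)?
    .filter(col("id").gt(lit(500)))?            // WHERE id > 500
    .sort(vec![col("name").sort(true, false)])? // ORDER BY name ASC
    .limit(0, Some(10))?                        // LIMIT 10
    .build()?;
```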
Note that all we are doing at this stage is building up the logical plan structure. No query +execution will be performed. + +Here are some examples of transformation methods, but for a full list, refer to the [LogicalPlanBuilder] API documentation. + +- `filter` +- `limit` +- `sort` +- `distinct` +- `join` + +The following example demonstrates building a simple query consisting of a table scan followed by a filter. + + +```rust +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// optional projection +let projection = None; + +// create a LogicalPlanBuilder for a table scan +let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; + +// perform a filter operation and build the plan +let plan = builder + .filter(col("id").gt(lit(500)))? // WHERE id > 500 + .build()?; + +// print the plan +println!("{}", plan.display_indent_schema()); +``` + +This example produces the following plan: + +``` +Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N] +``` + +## Table Sources + +The previous example used a [LogicalTableSource], which is used for tests and documentation in DataFusion, and is also +suitable if you are using DataFusion to build logical plans but do not use DataFusion's physical plan. However, if you +want to use a TableSource that can be executed in DataFusion then you will need to [DefaultTableSource], which is a +wrapper for a [TableProvider]. + +Both [LogicalTableSource] and [DefaultTableSource] implement the [TableSource] trait. [DefaultTableSource] acts as a +bridge between DataFusion's logical and physical plans and is necessary because the logical plan is contained in +the `datafusion_expr` crate, which does not know about DataFusion's physical plans. + +```rust +pub struct DefaultTableSource { + pub table_provider: Arc, +} +``` + +[LogicalPlanBuilder]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html +[LogicalTableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html +[DefaultTableSource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html +[TableProvider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html +[TableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html \ No newline at end of file diff --git a/docs/src/lib.rs b/docs/src/lib.rs new file mode 100644 index 000000000000..4b32a27f3cde --- /dev/null +++ b/docs/src/lib.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(test)] +mod library_logical_plan; \ No newline at end of file diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs new file mode 100644 index 000000000000..9783329a58f0 --- /dev/null +++ b/docs/src/library_logical_plan.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::prelude::*; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::error::Result; +use datafusion::logical_expr::builder::LogicalTableSource; +use datafusion::logical_expr::LogicalPlanBuilder; +use std::sync::Arc; + +#[test] +fn plan_builder_1() -> Result<()> { + + // create a logical table source + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + // optional projection + let projection = None; + + // create a LogicalPlanBuilder for a table scan + let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; + + // perform a filter operation and build the plan + let plan = builder + .filter(col("id").gt(lit(500)))? + .build()?; + + // print the plan + println!("{}", plan.display_indent_schema()); + + Ok(()) +} From 1b220a1f1220655a43f5f569898162af10ea794d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:31:51 -0600 Subject: [PATCH 02/26] prettier --- datafusion/core/src/prelude.rs | 2 +- .../building-logical-plans.md | 29 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs index 7689468e5d13..5cd8b3870f81 100644 --- a/datafusion/core/src/prelude.rs +++ b/datafusion/core/src/prelude.rs @@ -13,7 +13,7 @@ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License.pub}, +// under the License. //! DataFusion "prelude" to simplify importing common types. //! diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index f7b45c98454d..56aa7dc7563f 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -19,22 +19,22 @@ # Building Logical Plans -A logical plan is a structured representation of a database query that describes the high-level operations and -transformations needed to retrieve data from a database or data source. 
It abstracts away specific implementation -details and focuses on the logical flow of the query, including operations like filtering, sorting, and joining tables. +A logical plan is a structured representation of a database query that describes the high-level operations and +transformations needed to retrieve data from a database or data source. It abstracts away specific implementation +details and focuses on the logical flow of the query, including operations like filtering, sorting, and joining tables. This logical plan serves as an intermediate step before generating an optimized physical execution plan. -DataFusion logical plans are typically created using the [LogicalPlanBuilder] struct. The following associated functions can be -used to create a new builder: +DataFusion logical plans are typically created using the [LogicalPlanBuilder] struct. The following associated functions can be +used to create a new builder: - `empty` - create an empty plan with no fields - `values` - create a plan from a set of literal values - `scan` - create a plan representing a table scan - `scan_with_filters` - create a plan representing a table scan with filters -Once the builder is created, transformation methods can be called to declare that further operations should be -performed on the plan. Note that all we are doing at this stage is building up the logical plan structure. No query +Once the builder is created, transformation methods can be called to declare that further operations should be +performed on the plan. Note that all we are doing at this stage is building up the logical plan structure. No query execution will be performed. Here are some examples of transformation methods, but for a full list, refer to the [LogicalPlanBuilder] API documentation. @@ -45,9 +45,10 @@ Here are some examples of transformation methods, but for a full list, refer to - `distinct` - `join` -The following example demonstrates building a simple query consisting of a table scan followed by a filter. +The following example demonstrates building a simple query consisting of a table scan followed by a filter. + ```rust // create a logical table source let schema = Schema::new(vec![ @@ -80,13 +81,13 @@ Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] ## Table Sources -The previous example used a [LogicalTableSource], which is used for tests and documentation in DataFusion, and is also -suitable if you are using DataFusion to build logical plans but do not use DataFusion's physical plan. However, if you -want to use a TableSource that can be executed in DataFusion then you will need to [DefaultTableSource], which is a +The previous example used a [LogicalTableSource], which is used for tests and documentation in DataFusion, and is also +suitable if you are using DataFusion to build logical plans but do not use DataFusion's physical plan. However, if you +want to use a TableSource that can be executed in DataFusion then you will need to [DefaultTableSource], which is a wrapper for a [TableProvider]. -Both [LogicalTableSource] and [DefaultTableSource] implement the [TableSource] trait. [DefaultTableSource] acts as a -bridge between DataFusion's logical and physical plans and is necessary because the logical plan is contained in +Both [LogicalTableSource] and [DefaultTableSource] implement the [TableSource] trait. 
[DefaultTableSource] acts as a +bridge between DataFusion's logical and physical plans and is necessary because the logical plan is contained in the `datafusion_expr` crate, which does not know about DataFusion's physical plans. ```rust @@ -99,4 +100,4 @@ pub struct DefaultTableSource { [LogicalTableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html [DefaultTableSource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html [TableProvider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html -[TableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html \ No newline at end of file +[TableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html From 653bace135a45a346afb7a727968bb0a6a0a4da9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:32:58 -0600 Subject: [PATCH 03/26] formatting --- docs/src/lib.rs | 2 +- docs/src/library_logical_plan.rs | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/src/lib.rs b/docs/src/lib.rs index 4b32a27f3cde..f73132468ec9 100644 --- a/docs/src/lib.rs +++ b/docs/src/lib.rs @@ -16,4 +16,4 @@ // under the License. #[cfg(test)] -mod library_logical_plan; \ No newline at end of file +mod library_logical_plan; diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs index 9783329a58f0..15c2d3b63e46 100644 --- a/docs/src/library_logical_plan.rs +++ b/docs/src/library_logical_plan.rs @@ -15,16 +15,15 @@ // specific language governing permissions and limitations // under the License. -use datafusion::prelude::*; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::error::Result; use datafusion::logical_expr::builder::LogicalTableSource; use datafusion::logical_expr::LogicalPlanBuilder; +use datafusion::prelude::*; use std::sync::Arc; #[test] fn plan_builder_1() -> Result<()> { - // create a logical table source let schema = Schema::new(vec![ Field::new("id", DataType::Int32, true), @@ -39,9 +38,7 @@ fn plan_builder_1() -> Result<()> { let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; // perform a filter operation and build the plan - let plan = builder - .filter(col("id").gt(lit(500)))? 
- .build()?; + let plan = builder.filter(col("id").gt(lit(500)))?.build()?; // print the plan println!("{}", plan.display_indent_schema()); From 4db03cc9e3b7f528462eef392ccdfc69e6e7c591 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:38:11 -0600 Subject: [PATCH 04/26] formatting --- .../library-user-guide/building-logical-plans.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 56aa7dc7563f..0bab0c062efd 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -96,8 +96,8 @@ pub struct DefaultTableSource { } ``` -[LogicalPlanBuilder]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html -[LogicalTableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html -[DefaultTableSource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html -[TableProvider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html -[TableSource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html +[logicalplanbuilder]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html +[logicaltablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html +[defaulttablesource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html +[tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html +[tablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html From feedaf735dbffbd3e70ce43740acf501eabd058d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:38:42 -0600 Subject: [PATCH 05/26] formatting --- docs/source/library-user-guide/building-logical-plans.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 0bab0c062efd..6501446746f5 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -83,7 +83,7 @@ Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] The previous example used a [LogicalTableSource], which is used for tests and documentation in DataFusion, and is also suitable if you are using DataFusion to build logical plans but do not use DataFusion's physical plan. However, if you -want to use a TableSource that can be executed in DataFusion then you will need to [DefaultTableSource], which is a +want to use a [TableSource] that can be executed in DataFusion then you will need to use [DefaultTableSource], which is a wrapper for a [TableProvider]. Both [LogicalTableSource] and [DefaultTableSource] implement the [TableSource] trait. 
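As a rough, untested sketch (not part of the original examples), wrapping an executable table provider such as an in-memory `MemTable` could look like the following; here `provider_as_source` is assumed to be the helper that wraps a [TableProvider] in a [DefaultTableSource], and `schema` and `batch` are assumed to be defined elsewhere:

```rust
// sketch only: wrap an executable TableProvider so a logical plan can reference it;
// `schema` and `batch` are assumed to exist in the surrounding code
use datafusion::datasource::{provider_as_source, MemTable};

let provider = Arc::new(MemTable::try_new(SchemaRef::new(schema), vec![vec![batch]])?);
let source = provider_as_source(provider);
let plan = LogicalPlanBuilder::scan("person", source, None)?
    .filter(col("id").gt(lit(500)))?
    .build()?;
```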
[DefaultTableSource] acts as a From 3de5a041c30ba58eaff6bef31b46a6dc6f40ecc7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 09:59:00 -0600 Subject: [PATCH 06/26] more content --- .../building-logical-plans.md | 48 +++++++++++++++++++ docs/src/library_logical_plan.rs | 33 ++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 6501446746f5..bf7d33cecb76 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -25,6 +25,53 @@ details and focuses on the logical flow of the query, including operations like This logical plan serves as an intermediate step before generating an optimized physical execution plan. +## Building Logical Plans Manually + +DataFusion's [LogicalPlan] is an enum containing variants representing all the supported operators, and also +contains an `Extension` variant that allows projects building on DataFusion to add custom logical operators. + +It is possible to create logical plans by directly creating instances of the [LogicalPlan] enum as follows, but is is +much easier to use the [LogicalPlanBuilder], which is described in the next section. + +Here is an example of building a logical plan directly: + +```rust +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// create a TableScan plan +let projection = None; // optional projection +let filters = vec![]; // optional filters to push down +let fetch = None; // optional LIMIT +let table_scan = LogicalPlan::TableScan(TableScan::try_new( + "my_table", + Arc::new(table_source), + projection, + filters, + fetch, +)?); + +// create a Filter plan that wraps the TableScan +let filter_expr = col("id").gt(lit(500)); +let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); + +// print the plan +println!("{}", plan.display_indent_schema()); +``` + +This example produces the following plan: + +``` +Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N] +``` + +## Building Logical Plans with LogicalPlanBuilder + DataFusion logical plans are typically created using the [LogicalPlanBuilder] struct. 
The following associated functions can be used to create a new builder: @@ -96,6 +143,7 @@ pub struct DefaultTableSource { } ``` +[logicalplan]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/enum.LogicalPlan.html [logicalplanbuilder]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html [logicaltablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html [defaulttablesource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs index 15c2d3b63e46..39b00734b721 100644 --- a/docs/src/library_logical_plan.rs +++ b/docs/src/library_logical_plan.rs @@ -18,10 +18,41 @@ use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::error::Result; use datafusion::logical_expr::builder::LogicalTableSource; -use datafusion::logical_expr::LogicalPlanBuilder; +use datafusion::logical_expr::{Filter, LogicalPlan, LogicalPlanBuilder, TableScan}; use datafusion::prelude::*; use std::sync::Arc; +#[test] +fn plan_1() -> Result<()> { + // create a logical table source + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + // create a TableScan plan + let projection = None; // optional projection + let filters = vec![]; // optional filters to push down + let fetch = None; // optional LIMIT + let table_scan = LogicalPlan::TableScan(TableScan::try_new( + "my_table", + Arc::new(table_source), + projection, + filters, + fetch, + )?); + + // create a Filter plan that wraps the TableScan + let filter_expr = col("id").gt(lit(500)); + let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); + + // print the plan + println!("{}", plan.display_indent_schema()); + + Ok(()) +} + #[test] fn plan_builder_1() -> Result<()> { // create a logical table source From fb35b4ae8ffb8a7b6e084a2ae0f553807c0a2b3e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 10:05:29 -0600 Subject: [PATCH 07/26] use correct table name --- docs/source/library-user-guide/building-logical-plans.md | 4 ++-- docs/src/library_logical_plan.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index bf7d33cecb76..e38e0aeeaa1e 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -35,6 +35,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: + ```rust // create a logical table source let schema = Schema::new(vec![ @@ -48,7 +49,7 @@ let projection = None; // optional projection let filters = vec![]; // optional filters to push down let fetch = None; // optional LIMIT let table_scan = LogicalPlan::TableScan(TableScan::try_new( - "my_table", + "person", Arc::new(table_source), projection, filters, @@ -95,7 +96,6 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building a simple query consisting of a table scan followed by a filter. 
- ```rust // create a logical table source let schema = Schema::new(vec![ diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs index 39b00734b721..91c18b385af5 100644 --- a/docs/src/library_logical_plan.rs +++ b/docs/src/library_logical_plan.rs @@ -36,7 +36,7 @@ fn plan_1() -> Result<()> { let filters = vec![]; // optional filters to push down let fetch = None; // optional LIMIT let table_scan = LogicalPlan::TableScan(TableScan::try_new( - "my_table", + "person", Arc::new(table_source), projection, filters, From 8d7719764cd0e5c199748e68958a79b87b8a7145 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 10:28:33 -0600 Subject: [PATCH 08/26] formatting --- dev/update_datafusion_versions.py | 1 + docs/Cargo.toml | 2 +- docs/source/library-user-guide/building-logical-plans.md | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dev/update_datafusion_versions.py b/dev/update_datafusion_versions.py index 7cbe39fdfb66..19701b813671 100755 --- a/dev/update_datafusion_versions.py +++ b/dev/update_datafusion_versions.py @@ -43,6 +43,7 @@ 'datafusion-wasmtest': 'datafusion/wasmtest/Cargo.toml', 'datafusion-benchmarks': 'benchmarks/Cargo.toml', 'datafusion-examples': 'datafusion-examples/Cargo.toml', + 'datafusion-docs': 'docs/Cargo.toml', } def update_workspace_version(new_version: str): diff --git a/docs/Cargo.toml b/docs/Cargo.toml index dc92994f773c..21a69fb2bc06 100644 --- a/docs/Cargo.toml +++ b/docs/Cargo.toml @@ -29,4 +29,4 @@ authors = { workspace = true } rust-version = "1.70" [dependencies] -datafusion = { path = "../datafusion/core" } \ No newline at end of file +datafusion = { path = "../datafusion/core", version = "32.0.0", default-features = false } \ No newline at end of file diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index e38e0aeeaa1e..ff118c8fb040 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -36,6 +36,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: + ```rust // create a logical table source let schema = Schema::new(vec![ @@ -96,6 +97,7 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building a simple query consisting of a table scan followed by a filter. 
+ ```rust // create a logical table source let schema = Schema::new(vec![ From 9594b309c1a75693f651353690b4f7e30931a4ff Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 11:14:17 -0600 Subject: [PATCH 09/26] Update docs generation to include source from datafusion_docs tests --- docs/build.sh | 6 ++ docs/preprocess.py | 66 +++++++++++++++++++ .../building-logical-plans.md | 55 +--------------- 3 files changed, 74 insertions(+), 53 deletions(-) create mode 100644 docs/preprocess.py diff --git a/docs/build.sh b/docs/build.sh index 3fdcd0327024..262046b39992 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -21,8 +21,14 @@ set -e rm -rf build 2> /dev/null rm -rf temp 2> /dev/null + +# copy source to temp dir mkdir temp cp -rf source/* temp/ + +# copy markdown files into temp dir again and insert source from tests +python preprocess.py + # replace relative URLs with absolute URLs sed -i -e 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/blob\/main\//g' temp/contributor-guide/index.md make SOURCEDIR=`pwd`/temp html diff --git a/docs/preprocess.py b/docs/preprocess.py new file mode 100644 index 000000000000..6b016e2ad548 --- /dev/null +++ b/docs/preprocess.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import glob +import os +import re + +def copy_test_source(test_filename, test_method, output): + output.write("```rust\n") + with open(test_filename) as test: + found = False + for test_line in test.readlines(): + if test_line.startswith("fn {}".format(test_method)): + found = True + continue + if found: + if test_line.strip() == "Ok(())": + break + # TODO strip leading indent + output.write(test_line) + output.write("```") + + +def add_source(input, output): + print("Copying", input, "to", output) + # + include_pattern = "" + with open(input, "r") as input: + with open(output, "w") as output: + for line in input.readlines(): + matches = re.search(include_pattern, line) + if matches is not None: + test_file = matches.group(1) + test_method = matches.group(2) + test_filename = "src/{}.rs".format(test_file) + copy_test_source(test_filename, test_method, output) + else: + output.write(line) + + +def main(): + for file in glob.glob("source/**/*.md"): + dest = "temp/" + file[7:] + last_path_sep = dest.rindex("/") + dir = dest[0:last_path_sep] + if not os.path.exists(dir): + os.makedirs(dir) + add_source(file, dest) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index ff118c8fb040..bdc35948defe 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -35,35 +35,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: - - -```rust -// create a logical table source -let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, true), - Field::new("name", DataType::Utf8, true), -]); -let table_source = LogicalTableSource::new(SchemaRef::new(schema)); - -// create a TableScan plan -let projection = None; // optional projection -let filters = vec![]; // optional filters to push down -let fetch = None; // optional LIMIT -let table_scan = LogicalPlan::TableScan(TableScan::try_new( - "person", - Arc::new(table_source), - projection, - filters, - fetch, -)?); - -// create a Filter plan that wraps the TableScan -let filter_expr = col("id").gt(lit(500)); -let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); - -// print the plan -println!("{}", plan.display_indent_schema()); -``` + This example produces the following plan: @@ -96,30 +68,7 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building a simple query consisting of a table scan followed by a filter. - - -```rust -// create a logical table source -let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, true), - Field::new("name", DataType::Utf8, true), -]); -let table_source = LogicalTableSource::new(SchemaRef::new(schema)); - -// optional projection -let projection = None; - -// create a LogicalPlanBuilder for a table scan -let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; - -// perform a filter operation and build the plan -let plan = builder - .filter(col("id").gt(lit(500)))? 
// WHERE id > 500 - .build()?; - -// print the plan -println!("{}", plan.display_indent_schema()); -``` + This example produces the following plan: From ca62c0831ac87265305e613ca49c3b3f00b3436e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Oct 2023 11:31:51 -0600 Subject: [PATCH 10/26] improve script --- docs/preprocess.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index 6b016e2ad548..98be94921e47 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -19,8 +19,9 @@ import os import re + def copy_test_source(test_filename, test_method, output): - output.write("```rust\n") + lines = [] with open(test_filename) as test: found = False for test_line in test.readlines(): @@ -30,8 +31,34 @@ def copy_test_source(test_filename, test_method, output): if found: if test_line.strip() == "Ok(())": break - # TODO strip leading indent - output.write(test_line) + lines.append(test_line) + + # remove blank lines from the end of the list + while lines and lines[-1] == "": + lines.pop() + + # remove leading indent when possible + consistent_indent = True + for line in lines: + if len(line.strip()) > 0 and not ( + line.startswith(" ") or line.startswith("\t") + ): + print("not consistent", line) + consistent_indent = False + break + if consistent_indent: + old_lines = lines + lines = [] + for line in old_lines: + if len(line) >= 4: + lines.append(line[4:]) + else: + lines.append(line) + + # write to output + output.write("```rust\n") + for line in lines: + output.write(line) output.write("```") @@ -63,4 +90,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 0137395d7c46ed626bc2a3be261683a3757f935a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 09:49:14 -0600 Subject: [PATCH 11/26] fix merge conflict --- docs/source/library-user-guide/building-logical-plans.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 5a7d2bff9e65..55a92d99262d 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -23,14 +23,9 @@ A logical plan is a structured representation of a database query that describes transformations needed to retrieve data from a database or data source. It abstracts away specific implementation details and focuses on the logical flow of the query, including operations like filtering, sorting, and joining tables. -<<<<<<< HEAD -This logical plan serves as an intermediate step before generating an optimized physical execution plan. -======= This logical plan serves as an intermediate step before generating an optimized physical execution plan. This is explained in more detail in the [Query Planning and Execution Overview] section of the [Architecture Guide]. 
-> > > > > > > apache/main - ## Building Logical Plans Manually DataFusion's [LogicalPlan] is an enum containing variants representing all the supported operators, and also From edd3bf65ce1dd0a0e6699a5d05a3544931c2b2cd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 09:49:21 -0600 Subject: [PATCH 12/26] fix merge conflict --- docs/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Cargo.toml b/docs/Cargo.toml index b2df597fff00..9caa0bde3608 100644 --- a/docs/Cargo.toml +++ b/docs/Cargo.toml @@ -29,4 +29,4 @@ authors = { workspace = true } rust-version = "1.70" [dependencies] -datafusion = { path = "../datafusion/core", version = "32.0.0", default-features = false } \ No newline at end of file +datafusion = { path = "../datafusion/core", version = "32.0.0", default-features = false } From fa4f8f4c6386603e9366944c4b65fe3f971f8fd3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 09:51:21 -0600 Subject: [PATCH 13/26] remove debug logging --- docs/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index 98be94921e47..63a8a0b8ce0e 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -43,7 +43,6 @@ def copy_test_source(test_filename, test_method, output): if len(line.strip()) > 0 and not ( line.startswith(" ") or line.startswith("\t") ): - print("not consistent", line) consistent_indent = False break if consistent_indent: From 44434d7a1e9666364e27eb7106a1126e2160ded7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 10:34:01 -0600 Subject: [PATCH 14/26] Move udf source to tests --- docs/Cargo.toml | 1 + docs/README.md | 40 +++++++- docs/preprocess.py | 12 +-- docs/source/library-user-guide/adding-udfs.md | 54 +---------- .../building-logical-plans.md | 4 +- docs/src/lib.rs | 3 +- docs/src/library_logical_plan.rs | 16 +++- docs/src/library_udfs.rs | 92 +++++++++++++++++++ 8 files changed, 156 insertions(+), 66 deletions(-) create mode 100644 docs/src/library_udfs.rs diff --git a/docs/Cargo.toml b/docs/Cargo.toml index 9caa0bde3608..00e66bcdca47 100644 --- a/docs/Cargo.toml +++ b/docs/Cargo.toml @@ -30,3 +30,4 @@ rust-version = "1.70" [dependencies] datafusion = { path = "../datafusion/core", version = "32.0.0", default-features = false } +tokio = { version = "^1.0", features = ["rt-multi-thread"] } diff --git a/docs/README.md b/docs/README.md index 8b55e8756e19..b0e1dc8ad17a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -19,17 +19,24 @@ # DataFusion Documentation -This folder contains the source content of the [User Guide](./source/user-guide) -and [Contributor Guide](./source/contributor-guide). These are both published to -https://arrow.apache.org/datafusion/ as part of the release process. +This folder contains the source content of the following guides: + +- [User Guide] +- [Library Guide] +- [Contributor Guide] + +These guides are published to https://arrow.apache.org/datafusion/ as part of the release process. ## Dependencies It's recommended to install build dependencies and build the documentation inside a Python virtualenv. -- Python -- `pip install -r requirements.txt` +Install Python and then use pip to install dependencies: + +```shell +pip install -r requirements.txt +``` ## Build & Preview @@ -53,6 +60,25 @@ To make changes to the docs, simply make a Pull Request with your proposed changes as normal. When the PR is merged the docs will be automatically updated. 
+## Including Source Code + +We want to make sure that all source code in the documentation is tested as part of the build and release process. We +achieve this by writing the code in standard Rust tests in the `datafusion-docs-test` crate, and annotate the code with +comments that mark the beginning and end of the code example. + +```rust +//begin:my_example +let foo = 1 + 1; +//end:my_example +``` + +We can now include an include directive in the markdown file, specifying the name of the Rust file containing the test +and the name of the example. + +```md + +``` + ## Release Process This documentation is hosted at https://arrow.apache.org/datafusion/ @@ -67,3 +93,7 @@ The Apache Software Foundation provides https://arrow.apache.org/, which serves content based on the configuration in [.asf.yaml](https://github.com/apache/arrow-datafusion/blob/main/.asf.yaml), which specifies the target as https://arrow.apache.org/datafusion/. + +[user guide]: ./source/user-guide +[library guide]: ./source/library-user-guide +[contributor guide]: ./source/contributor-guide diff --git a/docs/preprocess.py b/docs/preprocess.py index 63a8a0b8ce0e..82b22a33de5a 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -23,14 +23,14 @@ def copy_test_source(test_filename, test_method, output): lines = [] with open(test_filename) as test: - found = False + in_example_code = False for test_line in test.readlines(): - if test_line.startswith("fn {}".format(test_method)): - found = True + if test_line.strip() == "//begin:{}".format(test_method): + in_example_code = True continue - if found: - if test_line.strip() == "Ok(())": - break + if test_line.strip() == "//end:{}".format(test_method): + break + if in_example_code: lines.append(test_line) # remove blank lines from the end of the list diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index a4b5ed0b40f1..3d1cefc09a4b 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -35,41 +35,13 @@ First we'll talk about adding an Scalar UDF end-to-end, then we'll talk about th A Scalar UDF is a function that takes a row of data and returns a single value. For example, this function takes a single i64 and returns a single i64 with 1 added to it: -```rust -use std::sync::Arc; - -use arrow::array::{ArrayRef, Int64Array}; -use datafusion::common::Result; - -use datafusion::common::cast::as_int64_array; - -pub fn add_one(args: &[ArrayRef]) -> Result { - // Error handling omitted for brevity - - let i64s = as_int64_array(&args[0])?; - - let new_array = i64s - .iter() - .map(|array_elem| array_elem.map(|value| value + 1)) - .collect::(); - - Ok(Arc::new(new_array)) -} -``` + For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. -```rust -let input = vec![Some(1), None, Some(3)]; -let input = Arc::new(Int64Array::from(input)) as ArrayRef; - -let result = add_one(&[input]).unwrap(); -let result = result.as_any().downcast_ref::().unwrap(); - -assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); -``` + The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. 
@@ -77,15 +49,7 @@ The challenge however is that DataFusion doesn't know about this function. We ne To register a Scalar UDF, you need to wrap the function implementation in a `ScalarUDF` struct and then register it with the `SessionContext`. DataFusion provides the `create_udf` and `make_scalar_function` helper functions to make this easier. -```rust -let udf = create_udf( - "add_one", - vec![DataType::Int64], - Arc::new(DataType::Int64), - Volatility::Immutable, - make_scalar_function(add_one), -); -``` + A few things to note: @@ -97,19 +61,11 @@ A few things to note: That gives us a `ScalarUDF` that we can register with the `SessionContext`: -```rust -let mut ctx = SessionContext::new(); - -ctx.register_udf(udf); -``` + At this point, you can use the `add_one` function in your query: -```rust -let sql = "SELECT add_one(1)"; - -let df = ctx.sql(&sql).await.unwrap(); -``` + ## Adding a Window UDF diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 55a92d99262d..b75a788e830a 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -36,7 +36,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: - + This example produces the following plan: @@ -71,7 +71,7 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building the same simple query plan as the previous example, with a table scan followed by a filter. - + This example produces the following plan: diff --git a/docs/src/lib.rs b/docs/src/lib.rs index f73132468ec9..3ab99e77aa9b 100644 --- a/docs/src/lib.rs +++ b/docs/src/lib.rs @@ -14,6 +14,5 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
- -#[cfg(test)] mod library_logical_plan; +mod library_udfs; diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs index 355003941570..76296c01be79 100644 --- a/docs/src/library_logical_plan.rs +++ b/docs/src/library_logical_plan.rs @@ -23,7 +23,8 @@ use datafusion::prelude::*; use std::sync::Arc; #[test] -fn plan_1() -> Result<()> { +fn create_plan() -> Result<()> { + //begin:create_plan // create a logical table source let schema = Schema::new(vec![ Field::new("id", DataType::Int32, true), @@ -49,12 +50,18 @@ fn plan_1() -> Result<()> { // print the plan println!("{}", plan.display_indent_schema()); + //end:create_plan + + //TODO + // assert_eq!(plan.display_indent_schema().to_string(), "Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]\n\ + // TableScan: person [id:Int32;N, name:Utf8;N]"); Ok(()) } #[test] -fn plan_builder_1() -> Result<()> { +fn build_plan() -> Result<()> { + //begin:build_plan // create a logical table source let schema = Schema::new(vec![ Field::new("id", DataType::Int32, true), @@ -73,6 +80,11 @@ fn plan_builder_1() -> Result<()> { // print the plan println!("{}", plan.display_indent_schema()); + //end:build_plan + + //TODO + // assert_eq!(plan.display_indent_schema().to_string(), "Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]\n\ + // TableScan: person [id:Int32;N, name:Utf8;N]"); Ok(()) } diff --git a/docs/src/library_udfs.rs b/docs/src/library_udfs.rs new file mode 100644 index 000000000000..a58db34aa5a2 --- /dev/null +++ b/docs/src/library_udfs.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::arrow::array::{ArrayRef, Int64Array}; +use datafusion::arrow::datatypes::DataType; +use datafusion::common::cast::as_int64_array; +use datafusion::error::Result; +use datafusion::logical_expr::Volatility; +use datafusion::physical_expr::functions::make_scalar_function; +use datafusion::prelude::{create_udf, SessionContext}; +use std::sync::Arc; +use tokio; + +//begin:add_one +fn add_one(args: &[ArrayRef]) -> Result { + let i64s = as_int64_array(&args[0])?; + + let new_array = i64s + .iter() + .map(|array_elem| array_elem.map(|value| value + 1)) + .collect::(); + + Ok(Arc::new(new_array)) +} +//end:add_one + +#[test] +fn call_add_one() -> Result<()> { + //begin:call_add_one + let input = vec![Some(1), None, Some(3)]; + let input = Arc::new(Int64Array::from(input)) as ArrayRef; + + let result = add_one(&[input])?; + let result = result + .as_any() + .downcast_ref::() + .expect("result is Int64Array"); + + assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); + //end:call_add_one + + Ok(()) +} + +#[test] +fn register_udf() -> Result<()> { + //begin:create_udf + let udf = create_udf( + "add_one", + vec![DataType::Int64], + Arc::new(DataType::Int64), + Volatility::Immutable, + make_scalar_function(add_one), + ); + //end:create_udf + //begin:register_udf + let ctx = SessionContext::new(); + ctx.register_udf(udf); + //end:register_udf + Ok(()) +} + +#[tokio::test] +async fn call_udf() -> Result<()> { + let udf = create_udf( + "add_one", + vec![DataType::Int64], + Arc::new(DataType::Int64), + Volatility::Immutable, + make_scalar_function(add_one), + ); + //begin:call_udf + let ctx = SessionContext::new(); + let sql = "SELECT add_one(1)"; + let df = ctx.sql(&sql).await?; + //end:call_udf + Ok(()) +} From ec05922ff0743d8179ca3a3eb956d60902d7a9ca Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 10:40:04 -0600 Subject: [PATCH 15/26] improve tests --- docs/src/library_logical_plan.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs index 76296c01be79..5a12c9034c60 100644 --- a/docs/src/library_logical_plan.rs +++ b/docs/src/library_logical_plan.rs @@ -52,9 +52,11 @@ fn create_plan() -> Result<()> { println!("{}", plan.display_indent_schema()); //end:create_plan - //TODO - // assert_eq!(plan.display_indent_schema().to_string(), "Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]\n\ - // TableScan: person [id:Int32;N, name:Utf8;N]"); + assert_eq!( + plan.display_indent_schema().to_string(), + r#"Filter: id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N]"# + ); Ok(()) } @@ -82,9 +84,11 @@ fn build_plan() -> Result<()> { println!("{}", plan.display_indent_schema()); //end:build_plan - //TODO - // assert_eq!(plan.display_indent_schema().to_string(), "Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]\n\ - // TableScan: person [id:Int32;N, name:Utf8;N]"); + assert_eq!( + plan.display_indent_schema().to_string(), + r#"Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N]"# + ); Ok(()) } From c867cf93dd3749b9bac0fca6e8d8a6e2274ed44b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 10:42:20 -0600 Subject: [PATCH 16/26] improve README --- docs/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/README.md b/docs/README.md index b0e1dc8ad17a..20617aff55d1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -42,7 +42,7 
@@ pip install -r requirements.txt Run the provided script to build the HTML pages. -```bash +```shell ./build.sh ``` @@ -50,7 +50,7 @@ The HTML will be generated into a `build` directory. Preview the site on Linux by running this command. -```bash +```shell firefox build/html/index.html ``` @@ -72,7 +72,7 @@ let foo = 1 + 1; //end:my_example ``` -We can now include an include directive in the markdown file, specifying the name of the Rust file containing the test +We can now put an `include` directive in the markdown file, specifying the name of the Rust file containing the test and the name of the example. ```md From 7763328d16dacf526779099c50e585d338824c6f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 10:49:19 -0600 Subject: [PATCH 17/26] improve README --- docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index 20617aff55d1..fb572f784669 100644 --- a/docs/README.md +++ b/docs/README.md @@ -64,7 +64,7 @@ automatically updated. We want to make sure that all source code in the documentation is tested as part of the build and release process. We achieve this by writing the code in standard Rust tests in the `datafusion-docs-test` crate, and annotate the code with -comments that mark the beginning and end of the code example. +comments that mark the beginning and end of the portion of the code that we want to include in the documentation. ```rust //begin:my_example From 074876aaaedc824e1a305d464606653024f4e68e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:13:54 -0600 Subject: [PATCH 18/26] save --- docs/source/library-user-guide/adding-udfs.md | 15 +++++++++++++++ .../library-user-guide/building-logical-plans.md | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index 3d1cefc09a4b..c7d958a5d62f 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -36,12 +36,18 @@ First we'll talk about adding an Scalar UDF end-to-end, then we'll talk about th A Scalar UDF is a function that takes a row of data and returns a single value. For example, this function takes a single i64 and returns a single i64 with 1 added to it: +```rust +TODO +``` For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. +```rust +TODO +``` The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. @@ -50,6 +56,9 @@ The challenge however is that DataFusion doesn't know about this function. We ne To register a Scalar UDF, you need to wrap the function implementation in a `ScalarUDF` struct and then register it with the `SessionContext`. DataFusion provides the `create_udf` and `make_scalar_function` helper functions to make this easier. 
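As a rough sketch, mirroring the `register_udf` test added to `docs/src/library_udfs.rs` earlier in this series, the call looks like this:

```rust
// minimal sketch, mirroring the test in docs/src/library_udfs.rs
let udf = create_udf(
    "add_one",                     // name used in SQL queries
    vec![DataType::Int64],         // argument types
    Arc::new(DataType::Int64),     // return type
    Volatility::Immutable,
    make_scalar_function(add_one), // wrap the Rust implementation
);
```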
+```rust +TODO +``` A few things to note: @@ -62,10 +71,16 @@ A few things to note: That gives us a `ScalarUDF` that we can register with the `SessionContext`: +```rust +TODO +``` At this point, you can use the `add_one` function in your query: +```rust +TODO +``` ## Adding a Window UDF diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index b75a788e830a..84530eb95c1b 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -37,6 +37,9 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: +```rust +TODO +``` This example produces the following plan: @@ -72,6 +75,9 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building the same simple query plan as the previous example, with a table scan followed by a filter. +```rust +TODO +``` This example produces the following plan: From c6a8c77776ccf8bca800ba526c70429b623f3c5a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:16:12 -0600 Subject: [PATCH 19/26] inline example code in source --- docs/build.sh | 2 +- docs/preprocess.py | 57 +++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/docs/build.sh b/docs/build.sh index 262046b39992..8fdaa937db0c 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -26,7 +26,7 @@ rm -rf temp 2> /dev/null mkdir temp cp -rf source/* temp/ -# copy markdown files into temp dir again and insert source from tests +# update markdown files with latest example source code from tests python preprocess.py # replace relative URLs with absolute URLs diff --git a/docs/preprocess.py b/docs/preprocess.py index 82b22a33de5a..c8b11cd45509 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -20,7 +20,7 @@ import re -def copy_test_source(test_filename, test_method, output): +def read_source(test_filename, test_method): lines = [] with open(test_filename) as test: in_example_code = False @@ -33,10 +33,6 @@ def copy_test_source(test_filename, test_method, output): if in_example_code: lines.append(test_line) - # remove blank lines from the end of the list - while lines and lines[-1] == "": - lines.pop() - # remove leading indent when possible consistent_indent = True for line in lines: @@ -54,38 +50,51 @@ def copy_test_source(test_filename, test_method, output): else: lines.append(line) - # write to output - output.write("```rust\n") - for line in lines: - output.write(line) - output.write("```") + return lines -def add_source(input, output): - print("Copying", input, "to", output) +def update_examples(source_file): + print("Updating code samples in ", source_file) + lines = [] + # finite state machine to track state + state_scan = "scan" + state_before_code = "before" + state_in_code = "in" # include_pattern = "" - with open(input, "r") as input: - with open(output, "w") as output: - for line in input.readlines(): + with open(source_file, "r") as input: + state = state_scan + for line in input.readlines(): + if state == state_scan: + lines.append(line) matches = re.search(include_pattern, line) if matches is not None: + state = state_before_code test_file = matches.group(1) test_method = matches.group(2) test_filename = "src/{}.rs".format(test_file) - copy_test_source(test_filename, test_method, output) - else: - output.write(line) + 
lines.append("```rust") + lines.extend(read_source(test_filename, test_method)) + lines.append("```") + elif state == state_before_code: + # there can be blank lines between the include directive and the start of the code + if len(line.strip()) > 0: + if line.startswith("```rust"): + state = state_in_code + else: + raise "expected Rust code to immediately follow include directive but found other content" + elif state == state_in_code: + if line.strip() == "```": + state = state_scan + + with open(source_file, "w") as output: + for line in lines: + output.write(line) def main(): for file in glob.glob("source/**/*.md"): - dest = "temp/" + file[7:] - last_path_sep = dest.rindex("/") - dir = dest[0:last_path_sep] - if not os.path.exists(dir): - os.makedirs(dir) - add_source(file, dest) + update_examples(file) if __name__ == "__main__": From fa96ed3afdea8c8d2765a5a374c90eff68d5dfc8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:19:56 -0600 Subject: [PATCH 20/26] save --- docs/preprocess.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index c8b11cd45509..7deda966cf94 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -73,8 +73,10 @@ def update_examples(source_file): test_file = matches.group(1) test_method = matches.group(2) test_filename = "src/{}.rs".format(test_file) - lines.append("```rust") - lines.extend(read_source(test_filename, test_method)) + lines.append("```rust\n") + source = read_source(test_filename, test_method) + for x in source: + lines.append(x) lines.append("```") elif state == state_before_code: # there can be blank lines between the include directive and the start of the code @@ -87,9 +89,12 @@ def update_examples(source_file): if line.strip() == "```": state = state_scan - with open(source_file, "w") as output: - for line in lines: - output.write(line) + if state == state_scan: + with open(source_file, "w") as output: + for line in lines: + output.write(line) + else: + raise "failed to rewrite example source code" def main(): From 73de2e205662c6d4f364d11cecbc0ff348351ebb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:21:04 -0600 Subject: [PATCH 21/26] inline source code in docs --- docs/preprocess.py | 2 +- docs/source/library-user-guide/adding-udfs.md | 42 +++++++++++++---- .../building-logical-plans.md | 47 +++++++++++++++++-- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index 7deda966cf94..dd0586d0f416 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -77,7 +77,7 @@ def update_examples(source_file): source = read_source(test_filename, test_method) for x in source: lines.append(x) - lines.append("```") + lines.append("```\n") elif state == state_before_code: # there can be blank lines between the include directive and the start of the code if len(line.strip()) > 0: diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index c7d958a5d62f..b5ebc686260a 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -37,18 +37,34 @@ A Scalar UDF is a function that takes a row of data and returns a single value. 
```rust -TODO -``` +fn add_one(args: &[ArrayRef]) -> Result { + let i64s = as_int64_array(&args[0])?; + + let new_array = i64s + .iter() + .map(|array_elem| array_elem.map(|value| value + 1)) + .collect::(); + Ok(Arc::new(new_array)) +} +``` For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. ```rust -TODO -``` +let input = vec![Some(1), None, Some(3)]; +let input = Arc::new(Int64Array::from(input)) as ArrayRef; + +let result = add_one(&[input])?; +let result = result + .as_any() + .downcast_ref::() + .expect("result is Int64Array"); +assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); +``` The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. ### Registering a Scalar UDF @@ -57,9 +73,14 @@ To register a Scalar UDF, you need to wrap the function implementation in a `Sca ```rust -TODO +let udf = create_udf( + "add_one", + vec![DataType::Int64], + Arc::new(DataType::Int64), + Volatility::Immutable, + make_scalar_function(add_one), +); ``` - A few things to note: - The first argument is the name of the function. This is the name that will be used in SQL queries. @@ -72,16 +93,17 @@ That gives us a `ScalarUDF` that we can register with the `SessionContext`: ```rust -TODO +let ctx = SessionContext::new(); +ctx.register_udf(udf); ``` - At this point, you can use the `add_one` function in your query: ```rust -TODO +let ctx = SessionContext::new(); +let sql = "SELECT add_one(1)"; +let df = ctx.sql(&sql).await?; ``` - ## Adding a Window UDF Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have access to the rows around them. Access to the the proximal rows is helpful, but adds some complexity to the implementation. 
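To round out the scalar UDF example above, the resulting `DataFrame` can also be executed and printed. A minimal sketch, assuming an async (for example Tokio) context like the other snippets in this guide, and the `udf` value created with `create_udf` above:

```rust
// Continue the query example: register the UDF, run the query, print the result.
let ctx = SessionContext::new();
ctx.register_udf(udf);

let df = ctx.sql("SELECT add_one(1)").await?;
df.show().await?; // prints a single row containing the value 2
```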
diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 84530eb95c1b..0a1e299a8035 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -38,9 +38,32 @@ Here is an example of building a logical plan directly: ```rust -TODO +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// create a TableScan plan +let projection = None; // optional projection +let filters = vec![]; // optional filters to push down +let fetch = None; // optional LIMIT +let table_scan = LogicalPlan::TableScan(TableScan::try_new( + "person", + Arc::new(table_source), + projection, + filters, + fetch, +)?); + +// create a Filter plan that evaluates `id > 500` and wraps the TableScan +let filter_expr = col("id").gt(lit(500)); +let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); + +// print the plan +println!("{}", plan.display_indent_schema()); ``` - This example produces the following plan: ``` @@ -76,9 +99,25 @@ The following example demonstrates building the same simple query plan as the pr ```rust -TODO -``` +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// optional projection +let projection = None; +// create a LogicalPlanBuilder for a table scan +let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; + +// perform a filter that evaluates `id > 500`, and build the plan +let plan = builder.filter(col("id").gt(lit(500)))?.build()?; + +// print the plan +println!("{}", plan.display_indent_schema()); +``` This example produces the following plan: ``` From f684f136a67dafddae4f13960d3236282b29926b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:22:21 -0600 Subject: [PATCH 22/26] add error check --- docs/preprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/preprocess.py b/docs/preprocess.py index dd0586d0f416..fd8fcafcab79 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -75,6 +75,8 @@ def update_examples(source_file): test_filename = "src/{}.rs".format(test_file) lines.append("```rust\n") source = read_source(test_filename, test_method) + if len(source) == 0: + raise "failed to read source code from unit tests" for x in source: lines.append(x) lines.append("```\n") From b148ac5a3edab2183134075794b9d0600b3e3e21 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:28:23 -0600 Subject: [PATCH 23/26] update docs --- docs/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index fb572f784669..5028943eece7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -73,10 +73,16 @@ let foo = 1 + 1; ``` We can now put an `include` directive in the markdown file, specifying the name of the Rust file containing the test -and the name of the example. +and the name of the example. The include directive must be followed immediately by a code block starting with +````rust and ending with ```. This code block will be replaced whenever the `preprocess.py` +Python script is executed. 
Note that this action can potentially be destructive so be sure to commit documentation +changes or otherwise back them up before running this script. ```md + +```rust +``` ``` ## Release Process From df3751692c948c496ace74bb38ab23e599a6a2cf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 09:31:54 -0600 Subject: [PATCH 24/26] prettier --- docs/preprocess.py | 4 ++-- docs/source/library-user-guide/adding-udfs.md | 10 ++++++++++ .../library-user-guide/building-logical-plans.md | 4 ++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index fd8fcafcab79..9ab30a1228bf 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -73,13 +73,13 @@ def update_examples(source_file): test_file = matches.group(1) test_method = matches.group(2) test_filename = "src/{}.rs".format(test_file) - lines.append("```rust\n") + lines.append("\n```rust\n") source = read_source(test_filename, test_method) if len(source) == 0: raise "failed to read source code from unit tests" for x in source: lines.append(x) - lines.append("```\n") + lines.append("```\n\n") elif state == state_before_code: # there can be blank lines between the include directive and the start of the code if len(line.strip()) > 0: diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index b5ebc686260a..cbefb07ecf94 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -36,6 +36,7 @@ First we'll talk about adding an Scalar UDF end-to-end, then we'll talk about th A Scalar UDF is a function that takes a row of data and returns a single value. For example, this function takes a single i64 and returns a single i64 with 1 added to it: + ```rust fn add_one(args: &[ArrayRef]) -> Result { let i64s = as_int64_array(&args[0])?; @@ -48,11 +49,13 @@ fn add_one(args: &[ArrayRef]) -> Result { Ok(Arc::new(new_array)) } ``` + For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. + ```rust let input = vec![Some(1), None, Some(3)]; let input = Arc::new(Int64Array::from(input)) as ArrayRef; @@ -65,6 +68,7 @@ let result = result assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); ``` + The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. ### Registering a Scalar UDF @@ -72,6 +76,7 @@ The challenge however is that DataFusion doesn't know about this function. We ne To register a Scalar UDF, you need to wrap the function implementation in a `ScalarUDF` struct and then register it with the `SessionContext`. DataFusion provides the `create_udf` and `make_scalar_function` helper functions to make this easier. + ```rust let udf = create_udf( "add_one", @@ -81,6 +86,7 @@ let udf = create_udf( make_scalar_function(add_one), ); ``` + A few things to note: - The first argument is the name of the function. This is the name that will be used in SQL queries. 
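The same argument list generalizes to functions that take more than one parameter. As a purely hypothetical illustration (the `multiply` kernel named here is not part of this guide), a two-argument UDF would simply declare both argument types:

```rust
// Hypothetical two-argument UDF registration, for illustration only.
// `multiply` is assumed to be a fn(&[ArrayRef]) -> Result<ArrayRef>, like `add_one`.
let udf = create_udf(
    "multiply",                             // name exposed to SQL
    vec![DataType::Int64, DataType::Int64], // one entry per argument
    Arc::new(DataType::Int64),              // return type
    Volatility::Immutable,
    make_scalar_function(multiply),
);
```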
@@ -92,18 +98,22 @@ A few things to note: That gives us a `ScalarUDF` that we can register with the `SessionContext`: + ```rust let ctx = SessionContext::new(); ctx.register_udf(udf); ``` + At this point, you can use the `add_one` function in your query: + ```rust let ctx = SessionContext::new(); let sql = "SELECT add_one(1)"; let df = ctx.sql(&sql).await?; ``` + ## Adding a Window UDF Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have access to the rows around them. Access to the the proximal rows is helpful, but adds some complexity to the implementation. diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 0a1e299a8035..ed7d6a51db15 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -37,6 +37,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect Here is an example of building a logical plan directly: + ```rust // create a logical table source let schema = Schema::new(vec![ @@ -64,6 +65,7 @@ let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan) // print the plan println!("{}", plan.display_indent_schema()); ``` + This example produces the following plan: ``` @@ -98,6 +100,7 @@ Here are some examples of transformation methods, but for a full list, refer to The following example demonstrates building the same simple query plan as the previous example, with a table scan followed by a filter. + ```rust // create a logical table source let schema = Schema::new(vec![ @@ -118,6 +121,7 @@ let plan = builder.filter(col("id").gt(lit(500)))?.build()?; // print the plan println!("{}", plan.display_indent_schema()); ``` + This example produces the following plan: ``` From 8d2661112e1bfc7236ff275730980c509c5ecea4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 10:37:15 -0600 Subject: [PATCH 25/26] update example --- docs/source/library-user-guide/adding-udfs.md | 6 +++++- docs/source/library-user-guide/building-logical-plans.md | 2 ++ docs/src/library_udfs.rs | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index cbefb07ecf94..316b6d455f76 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -50,6 +50,7 @@ fn add_one(args: &[ArrayRef]) -> Result { } ``` + For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. @@ -69,6 +70,7 @@ let result = result assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); ``` + The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. ### Registering a Scalar UDF @@ -87,6 +89,7 @@ let udf = create_udf( ); ``` + A few things to note: - The first argument is the name of the function. This is the name that will be used in SQL queries. 
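The guide notes above that the example skips error handling such as validating `args.len()`. A sketch of what that check might look like, reusing the `as_int64_array` helper from the example; the choice of `DataFusionError::Internal` as the error variant is an assumption:

```rust
// Sketch: the same kernel as add_one, with the argument-count check the text alludes to.
fn add_one_checked(args: &[ArrayRef]) -> Result<ArrayRef> {
    if args.len() != 1 {
        return Err(DataFusionError::Internal(format!(
            "add_one expects exactly one argument, got {}",
            args.len()
        )));
    }
    let i64s = as_int64_array(&args[0])?;
    let new_array = i64s
        .iter()
        .map(|v| v.map(|value| value + 1))
        .collect::<Int64Array>();
    Ok(Arc::new(new_array))
}
```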
@@ -104,16 +107,17 @@ let ctx = SessionContext::new(); ctx.register_udf(udf); ``` + At this point, you can use the `add_one` function in your query: ```rust -let ctx = SessionContext::new(); let sql = "SELECT add_one(1)"; let df = ctx.sql(&sql).await?; ``` + ## Adding a Window UDF Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have access to the rows around them. Access to the the proximal rows is helpful, but adds some complexity to the implementation. diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index ed7d6a51db15..141c171895e2 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -66,6 +66,7 @@ let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan) println!("{}", plan.display_indent_schema()); ``` + This example produces the following plan: ``` @@ -122,6 +123,7 @@ let plan = builder.filter(col("id").gt(lit(500)))?.build()?; println!("{}", plan.display_indent_schema()); ``` + This example produces the following plan: ``` diff --git a/docs/src/library_udfs.rs b/docs/src/library_udfs.rs index a58db34aa5a2..01900a949f76 100644 --- a/docs/src/library_udfs.rs +++ b/docs/src/library_udfs.rs @@ -83,8 +83,8 @@ async fn call_udf() -> Result<()> { Volatility::Immutable, make_scalar_function(add_one), ); - //begin:call_udf let ctx = SessionContext::new(); + //begin:call_udf let sql = "SELECT add_one(1)"; let df = ctx.sql(&sql).await?; //end:call_udf From 712eb09bb990bc7f134cac0a070a3a9b1618ab0e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 4 Nov 2023 15:40:57 -0600 Subject: [PATCH 26/26] prettier --- docs/preprocess.py | 2 +- docs/source/library-user-guide/adding-udfs.md | 5 ----- docs/source/library-user-guide/building-logical-plans.md | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/docs/preprocess.py b/docs/preprocess.py index 9ab30a1228bf..42e10ed0d805 100644 --- a/docs/preprocess.py +++ b/docs/preprocess.py @@ -79,7 +79,7 @@ def update_examples(source_file): raise "failed to read source code from unit tests" for x in source: lines.append(x) - lines.append("```\n\n") + lines.append("```\n") elif state == state_before_code: # there can be blank lines between the include directive and the start of the code if len(line.strip()) > 0: diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md index 316b6d455f76..b9e48bb38d06 100644 --- a/docs/source/library-user-guide/adding-udfs.md +++ b/docs/source/library-user-guide/adding-udfs.md @@ -50,7 +50,6 @@ fn add_one(args: &[ArrayRef]) -> Result { } ``` - For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number of arguments. This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value. @@ -70,7 +69,6 @@ let result = result assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)])); ``` - The challenge however is that DataFusion doesn't know about this function. We need to register it with DataFusion so that it can be used in the context of a query. ### Registering a Scalar UDF @@ -89,7 +87,6 @@ let udf = create_udf( ); ``` - A few things to note: - The first argument is the name of the function. This is the name that will be used in SQL queries. 
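The `//begin:` and `//end:` markers touched in this diff are what `preprocess.py` scans for when inlining example code into the markdown. Roughly, the surrounding test ends up like the sketch below after this change; the test attribute and error type are omitted or assumed, and only the lines between the markers are copied into the docs, which is why the `SessionContext` setup now sits above `//begin:call_udf`:

```rust
// Sketch of the call_udf test in docs/src/library_udfs.rs after this change.
async fn call_udf() -> Result<()> {
    // ... create and register the add_one ScalarUDF as shown earlier ...
    let ctx = SessionContext::new();
    //begin:call_udf
    let sql = "SELECT add_one(1)";
    let df = ctx.sql(&sql).await?;
    //end:call_udf
    Ok(())
}
```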
@@ -107,7 +104,6 @@ let ctx = SessionContext::new(); ctx.register_udf(udf); ``` - At this point, you can use the `add_one` function in your query: @@ -117,7 +113,6 @@ let sql = "SELECT add_one(1)"; let df = ctx.sql(&sql).await?; ``` - ## Adding a Window UDF Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have access to the rows around them. Access to the the proximal rows is helpful, but adds some complexity to the implementation. diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 141c171895e2..ed7d6a51db15 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -66,7 +66,6 @@ let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan) println!("{}", plan.display_indent_schema()); ``` - This example produces the following plan: ``` @@ -123,7 +122,6 @@ let plan = builder.filter(col("id").gt(lit(500)))?.build()?; println!("{}", plan.display_indent_schema()); ``` - This example produces the following plan: ```
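As a closing illustration of the builder examples these last hunks touch, further transformations can be chained before `build()`. This is a sketch only; it reuses the `table_source`, `col`, and `lit` values from the examples above, and the `limit(skip, fetch)` signature is an assumption based on the LogicalPlanBuilder API rather than something shown in this series:

```rust
// Sketch: the same scan-and-filter plan as above, with a LIMIT chained on.
let plan = LogicalPlanBuilder::scan("person", Arc::new(table_source), None)?
    .filter(col("id").gt(lit(500)))? // WHERE id > 500
    .limit(0, Some(10))?             // skip 0 rows, fetch at most 10
    .build()?;

// print the plan
println!("{}", plan.display_indent_schema());
```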