diff --git a/docs/Cargo.toml b/docs/Cargo.toml
index 4d01466924f9..44abd36b2d65 100644
--- a/docs/Cargo.toml
+++ b/docs/Cargo.toml
@@ -30,3 +30,4 @@ rust-version = "1.70"
 
 [dependencies]
 datafusion = { path = "../datafusion/core", version = "33.0.0", default-features = false }
+tokio = { version = "^1.0", features = ["macros", "rt-multi-thread"] }
diff --git a/docs/README.md b/docs/README.md
index 8b55e8756e19..5028943eece7 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -19,23 +19,30 @@
 # DataFusion Documentation
 
-This folder contains the source content of the [User Guide](./source/user-guide)
-and [Contributor Guide](./source/contributor-guide). These are both published to
-https://arrow.apache.org/datafusion/ as part of the release process.
+This folder contains the source content of the following guides:
+
+- [User Guide]
+- [Library Guide]
+- [Contributor Guide]
+
+These guides are published to https://arrow.apache.org/datafusion/ as part of the release process.
 
 ## Dependencies
 
 It's recommended to install build dependencies and build the documentation
 inside a Python virtualenv.
 
-- Python
-- `pip install -r requirements.txt`
+Install Python and then use pip to install dependencies:
+
+```shell
+pip install -r requirements.txt
+```
 
 ## Build & Preview
 
 Run the provided script to build the HTML pages.
 
-```bash
+```shell
 ./build.sh
 ```
 
@@ -43,7 +50,7 @@ The HTML will be generated into a `build` directory.
 
 Preview the site on Linux by running this command.
 
-```bash
+```shell
 firefox build/html/index.html
 ```
 
@@ -53,6 +60,31 @@ To make changes to the docs, simply make a Pull Request with your proposed
 changes as normal. When the PR is merged the docs will be automatically updated.
 
+## Including Source Code
+
+We want to make sure that all source code in the documentation is tested as part of the build and release process. We
+achieve this by writing the code as standard Rust tests in the `datafusion-docs-test` crate and annotating the code
+with comments that mark the beginning and end of the portion of the code that we want to include in the documentation.
+
+```rust
+//begin:my_example
+let foo = 1 + 1;
+//end:my_example
+```
+
+We can now put an `include` directive in the markdown file, specifying the name of the Rust file containing the test
+and the name of the example. The include directive must be followed immediately by a fenced Rust code block; the
+contents of this block will be replaced whenever the `preprocess.py` Python script is executed. Note that this action
+can potentially be destructive, so be sure to commit documentation changes or otherwise back them up before running
+this script.
+
+````md
+<!-- include: my_rust_file::my_example -->
+
+```rust
+```
+````
+
 ## Release Process
 
 This documentation is hosted at https://arrow.apache.org/datafusion/
 
@@ -67,3 +99,7 @@ The Apache Software Foundation provides https://arrow.apache.org/, which serves
 content based on the configuration in
 [.asf.yaml](https://github.com/apache/arrow-datafusion/blob/main/.asf.yaml),
 which specifies the target as https://arrow.apache.org/datafusion/.
+
+[user guide]: ./source/user-guide
+[library guide]: ./source/library-user-guide
+[contributor guide]: ./source/contributor-guide
diff --git a/docs/build.sh b/docs/build.sh
index 3fdcd0327024..8fdaa937db0c 100755
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -21,8 +21,14 @@ set -e
 
 rm -rf build 2> /dev/null
 rm -rf temp 2> /dev/null
+
+# update markdown files with latest example source code from tests (before copying, so the build sees them)
+python preprocess.py
+
+# copy source to temp dir
 mkdir temp
 cp -rf source/* temp/
+
 # replace relative URLs with absolute URLs
 sed -i -e 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/blob\/main\//g' temp/contributor-guide/index.md
 make SOURCEDIR=`pwd`/temp html
diff --git a/docs/preprocess.py b/docs/preprocess.py
new file mode 100644
index 000000000000..42e10ed0d805
--- /dev/null
+++ b/docs/preprocess.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import re
+
+
+def read_source(test_filename, test_method):
+    """Read the lines between //begin:<test_method> and //end:<test_method>."""
+    lines = []
+    with open(test_filename) as test:
+        in_example_code = False
+        for test_line in test.readlines():
+            if test_line.strip() == "//begin:{}".format(test_method):
+                in_example_code = True
+                continue
+            if test_line.strip() == "//end:{}".format(test_method):
+                break
+            if in_example_code:
+                lines.append(test_line)
+
+    # remove one level of leading indent when every non-empty line is indented
+    consistent_indent = True
+    for line in lines:
+        if len(line.strip()) > 0 and not (
+            line.startswith("    ") or line.startswith("\t")
+        ):
+            consistent_indent = False
+            break
+    if consistent_indent:
+        old_lines = lines
+        lines = []
+        for line in old_lines:
+            if line.startswith("\t"):
+                lines.append(line[1:])
+            elif len(line) >= 4:
+                lines.append(line[4:])
+            else:
+                lines.append(line)
+
+    return lines
+
+
+def update_examples(source_file):
+    print("Updating code samples in", source_file)
+    lines = []
+    # finite state machine to track state
+    state_scan = "scan"
+    state_before_code = "before"
+    state_in_code = "in"
+    # include directives look like: <!-- include: <rust_file>::<example_name> -->
+    include_pattern = "<!-- include: (.*)::(.*) -->"
+    with open(source_file, "r") as infile:
+        state = state_scan
+        for line in infile.readlines():
+            if state == state_scan:
+                lines.append(line)
+                matches = re.search(include_pattern, line)
+                if matches is not None:
+                    state = state_before_code
+                    test_file = matches.group(1)
+                    test_method = matches.group(2)
+                    test_filename = "src/{}.rs".format(test_file)
+                    lines.append("\n```rust\n")
+                    source = read_source(test_filename, test_method)
+                    if len(source) == 0:
+                        raise RuntimeError("failed to read source code from unit tests")
+                    for x in source:
+                        lines.append(x)
+                    lines.append("```\n")
+            elif state == state_before_code:
+                # there can be blank lines between the include directive and the start of the code
+                if len(line.strip()) > 0:
+                    if line.startswith("```rust"):
+                        state = state_in_code
+                    else:
+                        raise RuntimeError(
+                            "expected Rust code to immediately follow include directive but found other content"
+                        )
+            elif state == state_in_code:
+                # skip the stale code block; it was replaced above
+                if line.strip() == "```":
+                    state = state_scan
+
+    if state == state_scan:
+        with open(source_file, "w") as output:
+            for line in lines:
+                output.write(line)
+    else:
+        raise RuntimeError("failed to rewrite example source code")
+
+
+def main():
+    for path in glob.glob("source/**/*.md", recursive=True):
+        update_examples(path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md
index a4b5ed0b40f1..b9e48bb38d06 100644
--- a/docs/source/library-user-guide/adding-udfs.md
+++ b/docs/source/library-user-guide/adding-udfs.md
@@ -35,23 +35,16 @@ First we'll talk about adding a Scalar UDF end-to-end, then we'll talk about th
 
 A Scalar UDF is a function that takes a row of data and returns a single value. For example,
 this function takes a single i64 and returns a single i64 with 1 added to it:
 
-```rust
-use std::sync::Arc;
-
-use arrow::array::{ArrayRef, Int64Array};
-use datafusion::common::Result;
-
-use datafusion::common::cast::as_int64_array;
-
-pub fn add_one(args: &[ArrayRef]) -> Result<ArrayRef> {
-    // Error handling omitted for brevity
+<!-- include: library_udfs::add_one -->
+
+```rust
+fn add_one(args: &[ArrayRef]) -> Result<ArrayRef> {
     let i64s = as_int64_array(&args[0])?;
 
     let new_array = i64s
-    .iter()
-    .map(|array_elem| array_elem.map(|value| value + 1))
-    .collect::<Int64Array>();
+        .iter()
+        .map(|array_elem| array_elem.map(|value| value + 1))
+        .collect::<Int64Array>();
 
     Ok(Arc::new(new_array))
 }
@@ -61,12 +54,17 @@ For brevity, we've skipped some error handling, but e.g. you may want to check t
 
 This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it
 will return a new `ArrayRef` with 1 added to each value.
 
+<!-- include: library_udfs::call_add_one -->
+
 ```rust
 let input = vec![Some(1), None, Some(3)];
 let input = Arc::new(Int64Array::from(input)) as ArrayRef;
 
-let result = add_one(&[input]).unwrap();
-let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
+let result = add_one(&[input])?;
+let result = result
+    .as_any()
+    .downcast_ref::<Int64Array>()
+    .expect("result is Int64Array");
 
 assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)]));
 ```
 
@@ -77,6 +75,8 @@ The challenge however is that DataFusion doesn't know about this function. We ne
 
 To register a Scalar UDF, you need to wrap the function implementation in a `ScalarUDF` struct and then register it
 with the `SessionContext`. DataFusion provides the `create_udf` and `make_scalar_function` helper functions to make
 this easier.
+<!-- include: library_udfs::create_udf -->
+
 ```rust
 let udf = create_udf(
     "add_one",
     vec![DataType::Int64],
     Arc::new(DataType::Int64),
     Volatility::Immutable,
     make_scalar_function(add_one),
 );
 ```
 
@@ -97,18 +97,20 @@ A few things to note:
 
 That gives us a `ScalarUDF` that we can register with the `SessionContext`:
 
-```rust
-let mut ctx = SessionContext::new();
+<!-- include: library_udfs::register_udf -->
+```rust
+let ctx = SessionContext::new();
 
 ctx.register_udf(udf);
 ```
 
 At this point, you can use the `add_one` function in your query:
 
+<!-- include: library_udfs::call_udf -->
+
 ```rust
 let sql = "SELECT add_one(1)";
-
-let df = ctx.sql(&sql).await.unwrap();
+let df = ctx.sql(sql).await?;
 ```
 
 ## Adding a Window UDF
diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md
index fe922d8eaeb1..ed7d6a51db15 100644
--- a/docs/source/library-user-guide/building-logical-plans.md
+++ b/docs/source/library-user-guide/building-logical-plans.md
@@ -36,7 +36,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect
 
 Here is an example of building a logical plan directly:
 
-<!-- include: library_logical_plan::plan_1 -->
+<!-- include: library_logical_plan::create_plan -->
 
 ```rust
 // create a logical table source
@@ -58,7 +58,7 @@ let table_scan = LogicalPlan::TableScan(TableScan::try_new(
     fetch,
 )?);
 
-// create a Filter plan that evaluates `id > 500` that wraps the TableScan
+// create a Filter plan that evaluates `id > 500` and wraps the TableScan
 let filter_expr = col("id").gt(lit(500));
 let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?);
 
@@ -99,7 +99,7 @@ Here are some examples of transformation methods, but for a full list, refer to
 
 The following example demonstrates building the same simple query plan as the previous example,
 with a table scan followed by a filter.
 
-<!-- include: library_logical_plan::plan_builder_1 -->
+<!-- include: library_logical_plan::build_plan -->
 
 ```rust
 // create a logical table source
@@ -115,10 +115,8 @@ let projection = None;
 
 // create a LogicalPlanBuilder for a table scan
 let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?;
 
-// perform a filter operation and build the plan
-let plan = builder
-    .filter(col("id").gt(lit(500)))? // WHERE id > 500
-    .build()?;
+// perform a filter that evaluates `id > 500`, and build the plan
+let plan = builder.filter(col("id").gt(lit(500)))?.build()?;
 
 // print the plan
 println!("{}", plan.display_indent_schema());
diff --git a/docs/src/lib.rs b/docs/src/lib.rs
index f73132468ec9..3ab99e77aa9b 100644
--- a/docs/src/lib.rs
+++ b/docs/src/lib.rs
@@ -14,6 +14,5 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
-
-#[cfg(test)]
 mod library_logical_plan;
+mod library_udfs;
diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs
index 355003941570..5a12c9034c60 100644
--- a/docs/src/library_logical_plan.rs
+++ b/docs/src/library_logical_plan.rs
@@ -23,7 +23,8 @@ use datafusion::prelude::*;
 use std::sync::Arc;
 
 #[test]
-fn plan_1() -> Result<()> {
+fn create_plan() -> Result<()> {
+    //begin:create_plan
     // create a logical table source
     let schema = Schema::new(vec![
         Field::new("id", DataType::Int32, true),
@@ -49,12 +50,20 @@ fn plan_1() -> Result<()> {
 
     // print the plan
     println!("{}", plan.display_indent_schema());
+    //end:create_plan
+
+    assert_eq!(
+        plan.display_indent_schema().to_string(),
+        r#"Filter: id > Int32(500) [id:Int32;N, name:Utf8;N]
+  TableScan: person [id:Int32;N, name:Utf8;N]"#
+    );
 
     Ok(())
 }
 
 #[test]
-fn plan_builder_1() -> Result<()> {
+fn build_plan() -> Result<()> {
+    //begin:build_plan
     // create a logical table source
     let schema = Schema::new(vec![
         Field::new("id", DataType::Int32, true),
@@ -73,6 +82,13 @@ fn plan_builder_1() -> Result<()> {
 
     // print the plan
     println!("{}", plan.display_indent_schema());
+    //end:build_plan
+
+    assert_eq!(
+        plan.display_indent_schema().to_string(),
+        r#"Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]
+  TableScan: person [id:Int32;N, name:Utf8;N]"#
+    );
 
     Ok(())
 }
diff --git a/docs/src/library_udfs.rs b/docs/src/library_udfs.rs
new file mode 100644
index 000000000000..01900a949f76
--- /dev/null
+++ b/docs/src/library_udfs.rs
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
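+
+//! Example code for the "Adding UDFs" page of the library user guide. The
+//! `//begin:` and `//end:` comments mark the snippets that `preprocess.py`
+//! copies into the markdown documentation.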
+
+use datafusion::arrow::array::{ArrayRef, Int64Array};
+use datafusion::arrow::datatypes::DataType;
+use datafusion::common::cast::as_int64_array;
+use datafusion::error::Result;
+use datafusion::logical_expr::Volatility;
+use datafusion::physical_expr::functions::make_scalar_function;
+use datafusion::prelude::{create_udf, SessionContext};
+use std::sync::Arc;
+
+//begin:add_one
+fn add_one(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let i64s = as_int64_array(&args[0])?;
+
+    let new_array = i64s
+        .iter()
+        .map(|array_elem| array_elem.map(|value| value + 1))
+        .collect::<Int64Array>();
+
+    Ok(Arc::new(new_array))
+}
+//end:add_one
+
+#[test]
+fn call_add_one() -> Result<()> {
+    //begin:call_add_one
+    let input = vec![Some(1), None, Some(3)];
+    let input = Arc::new(Int64Array::from(input)) as ArrayRef;
+
+    let result = add_one(&[input])?;
+    let result = result
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .expect("result is Int64Array");
+
+    assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)]));
+    //end:call_add_one
+
+    Ok(())
+}
+
+#[test]
+fn register_udf() -> Result<()> {
+    //begin:create_udf
+    let udf = create_udf(
+        "add_one",
+        vec![DataType::Int64],
+        Arc::new(DataType::Int64),
+        Volatility::Immutable,
+        make_scalar_function(add_one),
+    );
+    //end:create_udf
+    //begin:register_udf
+    let ctx = SessionContext::new();
+    ctx.register_udf(udf);
+    //end:register_udf
+    Ok(())
+}
+
+#[tokio::test]
+async fn call_udf() -> Result<()> {
+    let udf = create_udf(
+        "add_one",
+        vec![DataType::Int64],
+        Arc::new(DataType::Int64),
+        Volatility::Immutable,
+        make_scalar_function(add_one),
+    );
+    let ctx = SessionContext::new();
+    // the UDF must be registered with the context before `ctx.sql` can resolve it
+    ctx.register_udf(udf);
+    //begin:call_udf
+    let sql = "SELECT add_one(1)";
+    let df = ctx.sql(sql).await?;
+    //end:call_udf
+    Ok(())
+}
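+
+// A hedged end-to-end sketch added for illustration (not referenced by the
+// markdown docs): it executes the query and checks the value produced by the
+// UDF. The test name `call_udf_and_collect` is introduced here and is not
+// part of the original example set.
+#[tokio::test]
+async fn call_udf_and_collect() -> Result<()> {
+    let udf = create_udf(
+        "add_one",
+        vec![DataType::Int64],
+        Arc::new(DataType::Int64),
+        Volatility::Immutable,
+        make_scalar_function(add_one),
+    );
+    let ctx = SessionContext::new();
+    ctx.register_udf(udf);
+
+    let df = ctx.sql("SELECT add_one(1)").await?;
+    // execute the plan and collect the results into record batches
+    let batches = df.collect().await?;
+
+    // expect a single batch with a single Int64 column containing the value 2
+    let column = as_int64_array(batches[0].column(0))?;
+    assert_eq!(column.value(0), 2);
+    Ok(())
+}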