apache · andygrove · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023
diff --git a/docs/Cargo.toml b/docs/Cargo.toml
@@ -30,3 +30,4 @@ rust-version = "1.70"
 
 [dependencies]
 datafusion = { path = "../datafusion/core", version = "33.0.0", default-features = false }
+tokio = { version = "^1.0", features = ["rt-multi-thread"] }
diff --git a/docs/README.md b/docs/README.md
@@ -19,31 +19,38 @@
 
 # DataFusion Documentation
 
-This folder contains the source content of the [User Guide](./source/user-guide)
-and [Contributor Guide](./source/contributor-guide). These are both published to
-https://arrow.apache.org/datafusion/ as part of the release process.
+This folder contains the source content of the following guides:
+
+- [User Guide]
+- [Library Guide]
+- [Contributor Guide]
+
+These guides are published to https://arrow.apache.org/datafusion/ as part of the release process.
 
 ## Dependencies
 
 It's recommended to install build dependencies and build the documentation
 inside a Python virtualenv.
 
-- Python
-- `pip install -r requirements.txt`
+Install Python and then use pip to install dependencies:
+
+```shell
+pip install -r requirements.txt
+```
 
 ## Build & Preview
 
 Run the provided script to build the HTML pages.
 
-```bash
+```shell
 ./build.sh
 ```
 
 The HTML will be generated into a `build` directory.
 
 Preview the site on Linux by running this command.
 
-```bash
+```shell
 firefox build/html/index.html
 ```
 
@@ -53,6 +60,31 @@ To make changes to the docs, simply make a Pull Request with your
 proposed changes as normal. When the PR is merged the docs will be
 automatically updated.
 
+## Including Source Code
+
+We want to make sure that all source code in the documentation is tested as part of the build and release process. We
+achieve this by writing the code in standard Rust tests in the `datafusion-docs-test` crate, and annotate the code with
+comments that mark the beginning and end of the portion of the code that we want to include in the documentation.
+
+```rust
+//begin:my_example
+let foo = 1 + 1;
+//end:my_example
+```
+
+We can now put an `include` directive in the markdown file, specifying the name of the Rust file containing the test
+and the name of the example. The include directive must be followed immediately by a code block starting with  
+`&#96;&#96;&#96;rust and ending with &#96;&#96;&#96;. This code block will be replaced whenever the `preprocess.py`
+Python script is executed. Note that this action can potentially be destructive so be sure to commit documentation
+changes or otherwise back them up before running this script.
+
+```md
+<!-- include: my_rust_file::my_example -->
+
+&#96;&#96;&#96;rust
+&#96;&#96;&#96;
+```
+
 ## Release Process
 
 This documentation is hosted at https://arrow.apache.org/datafusion/
@@ -67,3 +99,7 @@ The Apache Software Foundation provides https://arrow.apache.org/,
 which serves content based on the configuration in
 [.asf.yaml](https://github.com/apache/arrow-datafusion/blob/main/.asf.yaml),
 which specifies the target as https://arrow.apache.org/datafusion/.
+
+[user guide]: ./source/user-guide
+[library guide]: ./source/library-user-guide
+[contributor guide]: ./source/contributor-guide
diff --git a/docs/build.sh b/docs/build.sh
@@ -21,8 +21,14 @@
 set -e
 rm -rf build 2> /dev/null
 rm -rf temp 2> /dev/null
+
+# copy source to temp dir
 mkdir temp
 cp -rf source/* temp/
+
+# update markdown files with latest example source code from tests
+python preprocess.py
+
 # replace relative URLs with absolute URLs
 sed -i -e 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/blob\/main\//g' temp/contributor-guide/index.md
 make SOURCEDIR=`pwd`/temp html
diff --git a/docs/preprocess.py b/docs/preprocess.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import re
+
+
+def read_source(test_filename, test_method):
+    lines = []
+    with open(test_filename) as test:
+        in_example_code = False
+        for test_line in test.readlines():
+            if test_line.strip() == "//begin:{}".format(test_method):
+                in_example_code = True
+                continue
+            if test_line.strip() == "//end:{}".format(test_method):
+                break
+            if in_example_code:
+                lines.append(test_line)
+
+    # remove leading indent when possible
+    consistent_indent = True
+    for line in lines:
+        if len(line.strip()) > 0 and not (
+            line.startswith("    ") or line.startswith("\t")
+        ):
+            consistent_indent = False
+            break
+    if consistent_indent:
+        old_lines = lines
+        lines = []
+        for line in old_lines:
+            if len(line) >= 4:
+                lines.append(line[4:])
+            else:
+                lines.append(line)
+
+    return lines
+
+
+def update_examples(source_file):
+    print("Updating code samples in ", source_file)
+    lines = []
+    # finite state machine to track state
+    state_scan = "scan"
+    state_before_code = "before"
+    state_in_code = "in"
+    # <!-- include: library_logical_plan::plan_builder_1 -->
+    include_pattern = "<!-- include: ([a-z0-9_]*)::([a-z0-9_]*) -->"
+    with open(source_file, "r") as input:
+        state = state_scan
+        for line in input.readlines():
+            if state == state_scan:
+                lines.append(line)
+                matches = re.search(include_pattern, line)
+                if matches is not None:
+                    state = state_before_code
+                    test_file = matches.group(1)
+                    test_method = matches.group(2)
+                    test_filename = "src/{}.rs".format(test_file)
+                    lines.append("\n```rust\n")
+                    source = read_source(test_filename, test_method)
+                    if len(source) == 0:
+                        raise "failed to read source code from unit tests"
+                    for x in source:
+                        lines.append(x)
+                    lines.append("```\n")
+            elif state == state_before_code:
+                # there can be blank lines between the include directive and the start of the code
+                if len(line.strip()) > 0:
+                    if line.startswith("```rust"):
+                        state = state_in_code
+                    else:
+                        raise "expected Rust code to immediately follow include directive but found other content"
+            elif state == state_in_code:
+                if line.strip() == "```":
+                    state = state_scan
+
+    if state == state_scan:
+        with open(source_file, "w") as output:
+            for line in lines:
+                output.write(line)
+    else:
+        raise "failed to rewrite example source code"
+
+
+def main():
+    for file in glob.glob("source/**/*.md"):
+        update_examples(file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/adding-udfs.md
@@ -35,23 +35,16 @@ First we'll talk about adding an Scalar UDF end-to-end, then we'll talk about th
 
 A Scalar UDF is a function that takes a row of data and returns a single value. For example, this function takes a single i64 and returns a single i64 with 1 added to it:
 
-```rust
-use std::sync::Arc;
-
-use arrow::array::{ArrayRef, Int64Array};
-use datafusion::common::Result;
-
-use datafusion::common::cast::as_int64_array;
-
-pub fn add_one(args: &[ArrayRef]) -> Result<ArrayRef> {
-    // Error handling omitted for brevity
+<!-- include: library_udfs::add_one -->
 
+```rust
+fn add_one(args: &[ArrayRef]) -> Result<ArrayRef> {
     let i64s = as_int64_array(&args[0])?;
 
     let new_array = i64s
-      .iter()
-      .map(|array_elem| array_elem.map(|value| value + 1))
-      .collect::<Int64Array>();
+        .iter()
+        .map(|array_elem| array_elem.map(|value| value + 1))
+        .collect::<Int64Array>();
 
     Ok(Arc::new(new_array))
 }
@@ -61,12 +54,17 @@ For brevity, we'll skipped some error handling, but e.g. you may want to check t
 
 This "works" in isolation, i.e. if you have a slice of `ArrayRef`s, you can call `add_one` and it will return a new `ArrayRef` with 1 added to each value.
 
+<!-- include: library_udfs::call_add_one -->
+
 ```rust
 let input = vec![Some(1), None, Some(3)];
 let input = Arc::new(Int64Array::from(input)) as ArrayRef;
 
-let result = add_one(&[input]).unwrap();
-let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
+let result = add_one(&[input])?;
+let result = result
+    .as_any()
+    .downcast_ref::<Int64Array>()
+    .expect("result is Int64Array");
 
 assert_eq!(result, &Int64Array::from(vec![Some(2), None, Some(4)]));
 ```
@@ -77,6 +75,8 @@ The challenge however is that DataFusion doesn't know about this function. We ne
 
 To register a Scalar UDF, you need to wrap the function implementation in a `ScalarUDF` struct and then register it with the `SessionContext`. DataFusion provides the `create_udf` and `make_scalar_function` helper functions to make this easier.
 
+<!-- include: library_udfs::create_udf -->
+
 ```rust
 let udf = create_udf(
     "add_one",
@@ -97,18 +97,20 @@ A few things to note:
 
 That gives us a `ScalarUDF` that we can register with the `SessionContext`:
 
-```rust
-let mut ctx = SessionContext::new();
+<!-- include: library_udfs::register_udf -->
 
+```rust
+let ctx = SessionContext::new();
 ctx.register_udf(udf);
 ```
 
 At this point, you can use the `add_one` function in your query:
 
+<!-- include: library_udfs::call_udf -->
+
 ```rust
 let sql = "SELECT add_one(1)";
-
-let df = ctx.sql(&sql).await.unwrap();
+let df = ctx.sql(&sql).await?;
 ```
 
 ## Adding a Window UDF

diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md
@@ -36,7 +36,7 @@ much easier to use the [LogicalPlanBuilder], which is described in the next sect
 
 Here is an example of building a logical plan directly:
 
-<!-- source for this example is in datafusion_docs::library_logical_plan::plan_1 -->
+<!-- include: library_logical_plan::create_plan -->
 
 ```rust
 // create a logical table source
@@ -58,7 +58,7 @@ let table_scan = LogicalPlan::TableScan(TableScan::try_new(
     fetch,
 )?);
 
-// create a Filter plan that evaluates `id > 500` that wraps the TableScan
+// create a Filter plan that evaluates `id > 500` and wraps the TableScan
 let filter_expr = col("id").gt(lit(500));
 let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?);
 
@@ -99,7 +99,7 @@ Here are some examples of transformation methods, but for a full list, refer to
 
 The following example demonstrates building the same simple query plan as the previous example, with a table scan followed by a filter.
 
-<!-- source for this example is in datafusion_docs::library_logical_plan::plan_builder_1 -->
+<!-- include: library_logical_plan::build_plan -->
 
 ```rust
 // create a logical table source
@@ -115,10 +115,8 @@ let projection = None;
 // create a LogicalPlanBuilder for a table scan
 let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?;
 
-// perform a filter operation and build the plan
-let plan = builder
-    .filter(col("id").gt(lit(500)))? // WHERE id > 500
-    .build()?;
+// perform a filter that evaluates `id > 500`, and build the plan
+let plan = builder.filter(col("id").gt(lit(500)))?.build()?;
 
 // print the plan
 println!("{}", plan.display_indent_schema());

diff --git a/docs/src/lib.rs b/docs/src/lib.rs
@@ -14,6 +14,5 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-
-#[cfg(test)]
 mod library_logical_plan;
+mod library_udfs;
diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs
@@ -23,7 +23,8 @@ use datafusion::prelude::*;
 use std::sync::Arc;
 
 #[test]
-fn plan_1() -> Result<()> {
+fn create_plan() -> Result<()> {
+    //begin:create_plan
     // create a logical table source
     let schema = Schema::new(vec![
         Field::new("id", DataType::Int32, true),
@@ -49,12 +50,20 @@ fn plan_1() -> Result<()> {
 
     // print the plan
     println!("{}", plan.display_indent_schema());
+    //end:create_plan
+
+    assert_eq!(
+        plan.display_indent_schema().to_string(),
+        r#"Filter: id > Int32(500) [id:Int32;N, name:Utf8;N]
+  TableScan: person [id:Int32;N, name:Utf8;N]"#
+    );
 
     Ok(())
 }
 
 #[test]
-fn plan_builder_1() -> Result<()> {
+fn build_plan() -> Result<()> {
+    //begin:build_plan
     // create a logical table source
     let schema = Schema::new(vec![
         Field::new("id", DataType::Int32, true),
@@ -73,6 +82,13 @@ fn plan_builder_1() -> Result<()> {
 
     // print the plan
     println!("{}", plan.display_indent_schema());
+    //end:build_plan
+
+    assert_eq!(
+        plan.display_indent_schema().to_string(),
+        r#"Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]
+  TableScan: person [id:Int32;N, name:Utf8;N]"#
+    );
 
     Ok(())
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -30,3 +30,4 @@ rust-version = "1.70"

		[dependencies]
		datafusion = { path = "../datafusion/core", version = "33.0.0", default-features = false }
		tokio = { version = "^1.0", features = ["rt-multi-thread"] }