diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index edaa49ec6e7e..4527d047e4c0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -521,7 +521,7 @@ jobs: run: taplo format --check config-docs-check: - name: check configs.md is up-to-date + name: check configs.md and ***_functions.md is up-to-date needs: [ linux-build-lib ] runs-on: ubuntu-latest container: @@ -542,6 +542,11 @@ jobs: # If you encounter an error, run './dev/update_config_docs.sh' and commit ./dev/update_config_docs.sh git diff --exit-code + - name: Check if any of the ***_functions.md has been modified + run: | + # If you encounter an error, run './dev/update_function_docs.sh' and commit + ./dev/update_function_docs.sh + git diff --exit-code # Verify MSRV for the crates which are directly used by other projects: # - datafusion diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index a1157cbffbd6..8c77ea8a2557 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1401,6 +1401,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "half", + "indexmap", "log", "paste", ] diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs new file mode 100644 index 000000000000..92737b244a64 --- /dev/null +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::{ + aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF, + DocSection, Documentation, ScalarUDF, WindowUDF, +}; +use itertools::Itertools; +use std::env::args; +use std::fmt::Write as _; + +fn main() { + let args: Vec = args().collect(); + + if args.len() != 2 { + panic!( + "Usage: {} type (one of 'aggregate', 'scalar', 'window')", + args[0] + ); + } + + let function_type = args[1].trim().to_lowercase(); + let docs = match function_type.as_str() { + "aggregate" => print_aggregate_docs(), + "scalar" => print_scalar_docs(), + "window" => print_window_docs(), + _ => { + panic!("Unknown function type: {}", function_type) + } + }; + + println!("{docs}"); +} + +fn print_aggregate_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_aggregate_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, aggregate_doc_sections::doc_sections()) +} + +fn print_scalar_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_scalar_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, scalar_doc_sections::doc_sections()) +} + +fn print_window_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_window_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, window_doc_sections::doc_sections()) +} + +fn print_docs( + providers: 
Vec>, + doc_sections: Vec, +) -> String { + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections { + // make sure there is a function that is in this doc section + if !&providers.iter().any(|f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) { + continue; + } + + let providers: Vec<&Box> = providers + .iter() + .filter(|&f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) + .collect::>(); + + // write out section header + let _ = writeln!(docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(docs, "{description}"); + } + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&providers); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(docs, "- [{name}](#{name})"); + }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = providers + .iter() + .find(|f| f.get_name() == name || f.get_aliases().contains(&name)) + .unwrap(); + + let name = f.get_name(); + let aliases = f.get_aliases(); + let documentation = f.get_documentation(); + + // if this name is an alias we need to display what it's an alias of + if aliases.contains(&name) { + let _ = write!(docs, "_Alias of [{name}](#{name})._"); + continue; + } + + // otherwise display the documentation for the function + let Some(documentation) = documentation else { + unreachable!() + }; + + // first, the name, description and syntax example + let _ = write!( + docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + name, documentation.description, documentation.syntax_example + ); + + // next, arguments + if let Some(args) = 
&documentation.arguments { + let _ = writeln!(docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = &documentation.sql_example { + let _ = writeln!( + docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.get_aliases().is_empty() { + let _ = write!(docs, "#### Aliases"); + + for alias in f.get_aliases() { + let _ = writeln!(docs, "- {alias}"); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(docs, "- [{related}](#{related})"); + } + } + } + } + + docs +} + +trait DocProvider { + fn get_name(&self) -> String; + fn get_aliases(&self) -> Vec; + fn get_documentation(&self) -> Option<&Documentation>; +} + +impl DocProvider for AggregateUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl DocProvider for ScalarUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl DocProvider for WindowUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +#[allow(clippy::borrowed_box)] +#[allow(clippy::ptr_arg)] +fn get_names_and_aliases(functions: &Vec<&Box>) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.get_aliases().is_empty() { + 
vec![f.get_name().to_string()] + } else { + let mut names = vec![f.get_name().to_string()]; + names.extend(f.get_aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 7d94a3b93eab..849d9604808c 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -34,6 +34,7 @@ mod partition_evaluator; mod table_source; mod udaf; mod udf; +mod udf_docs; mod udwf; pub mod conditional_expressions; @@ -90,9 +91,12 @@ pub use logical_plan::*; pub use partition_evaluator::PartitionEvaluator; pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; -pub use udaf::{AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs}; -pub use udf::{ScalarUDF, ScalarUDFImpl}; -pub use udwf::{ReversedUDWF, WindowUDF, WindowUDFImpl}; +pub use udaf::{ + aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs, +}; +pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; +pub use udf_docs::{DocSection, Documentation, DocumentationBuilder}; +pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; #[cfg(test)] diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 780ea36910a4..6e48054bcf3d 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -36,8 +36,8 @@ use crate::function::{ use crate::groups_accumulator::GroupsAccumulator; use crate::utils::format_state_name; use crate::utils::AggregateOrderSensitivity; -use crate::Signature; use crate::{Accumulator, Expr}; +use crate::{Documentation, Signature}; /// Logical representation of a user-defined [aggregate function] (UDAF). /// @@ -275,6 +275,14 @@ impl AggregateUDF { pub fn default_value(&self, data_type: &DataType) -> Result { self.inner.default_value(data_type) } + + /// Returns the documentation for this Aggregate UDF. 
+ /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for AggregateUDF @@ -299,25 +307,42 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr}; +/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr, Documentation}; /// # use datafusion_expr::{AggregateUDFImpl, AggregateUDF, Accumulator, function::{AccumulatorArgs, StateFieldsArgs}}; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_AGGREGATE; /// # use arrow::datatypes::Schema; /// # use arrow::datatypes::Field; +/// /// #[derive(Debug, Clone)] /// struct GeoMeanUdf { -/// signature: Signature +/// signature: Signature, /// } /// /// impl GeoMeanUdf { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable), /// } /// } /// } /// +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_AGGREGATE) +/// .with_description("calculates a geometric mean") +/// .with_syntax_example("geo_mean(2.0)") +/// .with_argument("arg1", "The Float64 number for the geometric mean") +/// .build() +/// .unwrap() +/// }) +/// } +/// /// /// Implement the AggregateUDFImpl trait for GeoMeanUdf /// impl AggregateUDFImpl for GeoMeanUdf { /// fn as_any(&self) -> &dyn Any { self } @@ -325,7 +350,7 @@ where /// fn signature(&self) -> &Signature { &self.signature } /// fn return_type(&self, args: &[DataType]) -> Result { /// if 
!matches!(args.get(0), Some(&DataType::Float64)) { -/// return plan_err!("add_one only accepts Float64 arguments"); +/// return plan_err!("geo_mean only accepts Float64 arguments"); /// } /// Ok(DataType::Float64) /// } @@ -337,6 +362,9 @@ where /// Field::new("ordering", DataType::UInt32, true) /// ]) /// } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new AggregateUDF from the implementation @@ -603,6 +631,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { fn default_value(&self, data_type: &DataType) -> Result { ScalarValue::try_from(data_type) } + + /// Returns the documentation for this Aggregate UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } impl PartialEq for dyn AggregateUDFImpl { @@ -749,6 +785,41 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl { fn is_descending(&self) -> Option { self.inner.is_descending() } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Aggregate UDF doc sections for use in public documentation +pub mod aggregate_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_GENERAL, + DOC_SECTION_STATISTICAL, + DOC_SECTION_APPROXIMATE, + ] + } + + pub const DOC_SECTION_GENERAL: DocSection = DocSection { + include: true, + label: "General Functions", + description: None, + }; + + pub const DOC_SECTION_STATISTICAL: DocSection = DocSection { + include: true, + label: "Statistical Functions", + description: None, + }; + + pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection { + include: true, + label: "Approximate Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 938e1181d85d..3759fb18f56d 100644 --- a/datafusion/expr/src/udf.rs +++ 
b/datafusion/expr/src/udf.rs @@ -20,7 +20,9 @@ use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; -use crate::{ColumnarValue, Expr, ScalarFunctionImplementation, Signature}; +use crate::{ + ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature, +}; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; use datafusion_expr_common::interval_arithmetic::Interval; @@ -274,6 +276,14 @@ impl ScalarUDF { pub fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + /// Returns the documentation for this Scalar UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for ScalarUDF @@ -298,22 +308,39 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility}; +/// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility}; /// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF}; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// /// #[derive(Debug)] /// struct AddOne { -/// signature: Signature +/// signature: Signature, /// } /// /// impl AddOne { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), /// } /// } /// } +/// +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// 
Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg1", "The int32 number to add one to") +/// .build() +/// .unwrap() +/// }) +/// } /// /// /// Implement the ScalarUDFImpl trait for AddOne /// impl ScalarUDFImpl for AddOne { @@ -328,6 +355,9 @@ where /// } /// // The actual implementation would add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new ScalarUDF from the implementation @@ -596,6 +626,14 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { self.signature().hash(hasher); hasher.finish() } + + /// Returns the documentation for this Scalar UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } /// ScalarUDF that adds an alias to the underlying function. 
It is better to @@ -709,4 +747,100 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.aliases.hash(hasher); hasher.finish() } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Scalar UDF doc sections for use in public documentation +pub mod scalar_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_MATH, + DOC_SECTION_CONDITIONAL, + DOC_SECTION_STRING, + DOC_SECTION_BINARY_STRING, + DOC_SECTION_REGEX, + DOC_SECTION_DATETIME, + DOC_SECTION_ARRAY, + DOC_SECTION_STRUCT, + DOC_SECTION_MAP, + DOC_SECTION_HASHING, + DOC_SECTION_OTHER, + ] + } + + pub const DOC_SECTION_MATH: DocSection = DocSection { + include: true, + label: "Math Functions", + description: None, + }; + + pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection { + include: true, + label: "Conditional Functions", + description: None, + }; + + pub const DOC_SECTION_STRING: DocSection = DocSection { + include: true, + label: "String Functions", + description: None, + }; + + pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection { + include: true, + label: "Binary String Functions", + description: None, + }; + + pub const DOC_SECTION_REGEX: DocSection = DocSection { + include: true, + label: "Regular Expression Functions", + description: Some( + r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). 
+The following regular expression functions are supported:"#, + ), + }; + + pub const DOC_SECTION_DATETIME: DocSection = DocSection { + include: true, + label: "Time and Date Functions", + description: None, + }; + + pub const DOC_SECTION_ARRAY: DocSection = DocSection { + include: true, + label: "Array Functions", + description: None, + }; + + pub const DOC_SECTION_STRUCT: DocSection = DocSection { + include: true, + label: "Struct Functions", + description: None, + }; + + pub const DOC_SECTION_MAP: DocSection = DocSection { + include: true, + label: "Map Functions", + description: None, + }; + + pub const DOC_SECTION_HASHING: DocSection = DocSection { + include: true, + label: "Hashing Functions", + description: None, + }; + + pub const DOC_SECTION_OTHER: DocSection = DocSection { + include: true, + label: "Other Functions", + description: None, + }; } diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs new file mode 100644 index 000000000000..280910b87199 --- /dev/null +++ b/datafusion/expr/src/udf_docs.rs @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion_common::exec_err; +use datafusion_common::Result; + +/// Documentation for use by [`ScalarUDFImpl`](crate::ScalarUDFImpl), +/// [`AggregateUDFImpl`](crate::AggregateUDFImpl) and [`WindowUDFImpl`](crate::WindowUDFImpl) functions +/// that will be used to generate public documentation. +/// +/// The name of the udf will be pulled from the [`ScalarUDFImpl::name`](crate::ScalarUDFImpl::name), +/// [`AggregateUDFImpl::name`](crate::AggregateUDFImpl::name) or [`WindowUDFImpl::name`](crate::WindowUDFImpl::name) +/// function as appropriate. +/// +/// All strings in the documentation are required to be +/// in [markdown format](https://www.markdownguide.org/basic-syntax/). +/// +/// Currently, documentation only supports a single language +/// thus all text should be in English. +#[derive(Debug, Clone)] +pub struct Documentation { + /// the section in the documentation where the UDF will be documented + pub doc_section: DocSection, + /// the description for the UDF + pub description: String, + /// a brief example of the syntax. For example "ascii(str)" + pub syntax_example: String, + /// a sql example for the UDF, usually in the form of a sql prompt + /// query and output. It is strongly recommended to provide an + /// example for anything but the most basic UDF's + pub sql_example: Option, + /// arguments for the UDF which will be displayed in array order. + /// Left member of a pair is the argument name, right is a + /// description for the argument + pub arguments: Option>, + /// related functions if any. Values should match the related + /// udf's name exactly. Related udf's must be of the same + /// UDF type (scalar, aggregate or window) for proper linking to + /// occur + pub related_udfs: Option>, +} + +impl Documentation { + /// Returns a new [`DocumentationBuilder`] with no options set. 
+ pub fn builder() -> DocumentationBuilder { + DocumentationBuilder::new() + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct DocSection { + /// true to include this doc section in the public + /// documentation, false otherwise + pub include: bool, + /// a display label for the doc section. For example: "Math Expressions" + pub label: &'static str, + /// an optional description for the doc section + pub description: Option<&'static str>, +} + +/// A builder to be used for building [`Documentation`]'s. +/// +/// Example: +/// +/// ```rust +/// # use datafusion_expr::Documentation; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// # use datafusion_common::Result; +/// # +/// # fn main() -> Result<()> { +/// let documentation = Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg_1", "The int32 number to add one to") +/// .build()?; +/// Ok(()) +/// # } +pub struct DocumentationBuilder { + pub doc_section: Option, + pub description: Option, + pub syntax_example: Option, + pub sql_example: Option, + pub arguments: Option>, + pub related_udfs: Option>, +} + +impl DocumentationBuilder { + pub fn new() -> Self { + Self { + doc_section: None, + description: None, + syntax_example: None, + sql_example: None, + arguments: None, + related_udfs: None, + } + } + + pub fn with_doc_section(mut self, doc_section: DocSection) -> Self { + self.doc_section = Some(doc_section); + self + } + + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = Some(description.into()); + self + } + + pub fn with_syntax_example(mut self, syntax_example: impl Into) -> Self { + self.syntax_example = Some(syntax_example.into()); + self + } + + pub fn with_sql_example(mut self, sql_example: impl Into) -> Self { + self.sql_example = Some(sql_example.into()); + self + } + + pub fn with_argument( + mut self, + arg_name: 
impl Into, + arg_description: impl Into, + ) -> Self { + let mut args = self.arguments.unwrap_or_default(); + args.push((arg_name.into(), arg_description.into())); + self.arguments = Some(args); + self + } + + pub fn with_related_udf(mut self, related_udf: impl Into) -> Self { + let mut related = self.related_udfs.unwrap_or_default(); + related.push(related_udf.into()); + self.related_udfs = Some(related); + self + } + + pub fn build(self) -> Result { + let Self { + doc_section, + description, + syntax_example, + sql_example, + arguments, + related_udfs, + } = self; + + if doc_section.is_none() { + return exec_err!("Documentation must have a doc section"); + } + if description.is_none() { + return exec_err!("Documentation must have a description"); + } + if syntax_example.is_none() { + return exec_err!("Documentation must have a syntax_example"); + } + + Ok(Documentation { + doc_section: doc_section.unwrap(), + description: description.unwrap(), + syntax_example: syntax_example.unwrap(), + sql_example, + arguments, + related_udfs, + }) + } +} + +impl Default for DocumentationBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 678a0b62cd9a..6459e8f3f7d1 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -33,7 +33,8 @@ use datafusion_functions_window_common::field::WindowUDFFieldArgs; use crate::expr::WindowFunction; use crate::{ - function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature, + function::WindowFunctionSimplification, Documentation, Expr, PartitionEvaluator, + Signature, }; /// Logical representation of a user-defined window function (UDWF) @@ -180,6 +181,14 @@ impl WindowUDF { pub fn reverse_expr(&self) -> ReversedUDWF { self.inner.reverse_expr() } + + /// Returns the documentation for this Window UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. 
+ pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for WindowUDF @@ -204,30 +213,47 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt}; +/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation}; /// # use datafusion_expr::{WindowUDFImpl, WindowUDF}; -/// use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// /// #[derive(Debug, Clone)] /// struct SmoothIt { -/// signature: Signature +/// signature: Signature, /// } /// /// impl SmoothIt { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), /// } /// } /// } /// -/// /// Implement the WindowUDFImpl trait for AddOne +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_ANALYTICAL) +/// .with_description("smooths the windows") +/// .with_syntax_example("smooth_it(2)") +/// .with_argument("arg1", "The int32 number to smooth by") +/// .build() +/// .unwrap() +/// }) +/// } +/// +/// /// Implement the WindowUDFImpl trait for SmoothIt /// impl WindowUDFImpl for SmoothIt { /// fn as_any(&self) -> &dyn Any { self } /// fn name(&self) -> &str { "smooth_it" } /// fn signature(&self) -> &Signature { &self.signature } -/// // The actual implementation would add one to the argument +/// // The 
actual implementation would smooth the window /// fn partition_evaluator(&self) -> Result> { unimplemented!() } /// fn field(&self, field_args: WindowUDFFieldArgs) -> Result { /// if let Some(DataType::Int32) = field_args.get_input_type(0) { @@ -236,6 +262,9 @@ where /// plan_err!("smooth_it only accepts Int32 arguments") /// } /// } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new WindowUDF from the implementation @@ -365,6 +394,14 @@ pub trait WindowUDFImpl: Debug + Send + Sync { fn reverse_expr(&self) -> ReversedUDWF { ReversedUDWF::NotSupported } + + /// Returns the documentation for this Window UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } pub enum ReversedUDWF { @@ -465,6 +502,41 @@ impl WindowUDFImpl for AliasedWindowUDFImpl { fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Window UDF doc sections for use in public documentation +pub mod window_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_AGGREGATE, + DOC_SECTION_RANKING, + DOC_SECTION_ANALYTICAL, + ] + } + + pub const DOC_SECTION_AGGREGATE: DocSection = DocSection { + include: true, + label: "Aggregate Functions", + description: Some("All aggregate functions can be used as window functions."), + }; + + pub const DOC_SECTION_RANKING: DocSection = DocSection { + include: true, + label: "Ranking Functions", + description: None, + }; + + pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection { + include: true, + label: "Analytical Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 
33a52afbe21a..37e4c7f4a5ad 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -48,6 +48,7 @@ datafusion-functions-aggregate-common = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } half = { workspace = true } +indexmap = { workspace = true } log = { workspace = true } paste = "1.0.14" diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index aa65062e3330..ce36e09bc25b 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -35,11 +35,14 @@ use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::INTEGERS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; +use std::sync::OnceLock; /// This macro helps create group accumulators based on bitwise operations typically used internally /// and might not be necessary for users to call directly. @@ -110,8 +113,9 @@ macro_rules! downcast_bitwise_accumulator { /// `EXPR_FN` identifier used to name the generated expression function. /// `AGGREGATE_UDF_FN` is an identifier used to name the underlying UDAF function. /// `OPR_TYPE` is an expression that evaluates to the type of bitwise operation to be performed. +/// `DOCUMENTATION` documentation for the UDAF macro_rules! 
make_bitwise_udaf_expr_and_func { - ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr) => { + ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr, $DOCUMENTATION:expr) => { make_udaf_expr!( $EXPR_FN, expr_x, @@ -125,14 +129,80 @@ macro_rules! make_bitwise_udaf_expr_and_func { create_func!( $EXPR_FN, $AGGREGATE_UDF_FN, - BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN)) + BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN), $DOCUMENTATION) ); }; } -make_bitwise_udaf_expr_and_func!(bit_and, bit_and_udaf, BitwiseOperationType::And); -make_bitwise_udaf_expr_and_func!(bit_or, bit_or_udaf, BitwiseOperationType::Or); -make_bitwise_udaf_expr_and_func!(bit_xor, bit_xor_udaf, BitwiseOperationType::Xor); +static BIT_AND_DOC: OnceLock = OnceLock::new(); + +fn get_bit_and_doc() -> &'static Documentation { + BIT_AND_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise AND of all non-null input values.") + .with_syntax_example("bit_and(expression)") + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} + +static BIT_OR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_or_doc() -> &'static Documentation { + BIT_OR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise OR of all non-null input values.") + .with_syntax_example("bit_or(expression)") + .with_argument( + "expression", + "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} + +static BIT_XOR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_xor_doc() -> &'static Documentation { + BIT_XOR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise exclusive OR of all non-null input values.") + .with_syntax_example("bit_xor(expression)") + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} + +make_bitwise_udaf_expr_and_func!( + bit_and, + bit_and_udaf, + BitwiseOperationType::And, + get_bit_and_doc() +); +make_bitwise_udaf_expr_and_func!( + bit_or, + bit_or_udaf, + BitwiseOperationType::Or, + get_bit_or_doc() +); +make_bitwise_udaf_expr_and_func!( + bit_xor, + bit_xor_udaf, + BitwiseOperationType::Xor, + get_bit_xor_doc() +); /// The different types of bitwise operations that can be performed. #[derive(Debug, Clone, Eq, PartialEq)] @@ -155,14 +225,20 @@ struct BitwiseOperation { /// `operation` indicates the type of bitwise operation to be performed. 
operation: BitwiseOperationType, func_name: &'static str, + documentation: &'static Documentation, } impl BitwiseOperation { - pub fn new(operator: BitwiseOperationType, func_name: &'static str) -> Self { + pub fn new( + operator: BitwiseOperationType, + func_name: &'static str, + documentation: &'static Documentation, + ) -> Self { Self { operation: operator, signature: Signature::uniform(1, INTEGERS.to_vec(), Volatility::Immutable), func_name, + documentation, } } } @@ -239,6 +315,10 @@ impl AggregateUDFImpl for BitwiseOperation { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(self.documentation) + } } struct BitAndAccumulator { diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index a2e1b2222bb7..c903f6778ae8 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -17,19 +17,22 @@ //! Defines physical expression for `row_number` that can evaluated at runtime during query execution -use std::any::Any; -use std::fmt::Debug; -use std::ops::Range; - use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::UInt64Array; use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; +use datafusion_expr::{ + Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, +}; use datafusion_functions_window_common::field; use field::WindowUDFFieldArgs; +use std::any::Any; +use std::fmt::Debug; +use std::ops::Range; +use std::sync::OnceLock; define_udwf_and_expr!( RowNumber, @@ -58,6 +61,21 @@ impl Default for RowNumber { } } +static DOCUMENTATION: OnceLock = 
OnceLock::new(); + +fn get_row_number_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Number of the current row within its partition, counting from 1.", + ) + .with_syntax_example("row_number()") + .build() + .unwrap() + }) +} + impl WindowUDFImpl for RowNumber { fn as_any(&self) -> &dyn Any { self @@ -85,6 +103,10 @@ impl WindowUDFImpl for RowNumber { nulls_first: false, }) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_row_number_doc()) + } } /// State for the `row_number` built-in window function. diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 2fa6d7c197ad..d8ff44798f8a 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -15,17 +15,18 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::array::{new_null_array, BooleanArray}; use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, ExprSchema, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::type_coercion::binary::type_union_resolution; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use itertools::Itertools; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct CoalesceFunc { @@ -46,6 +47,23 @@ impl CoalesceFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_coalesce_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns the first of its arguments that is not _null_. 
Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") + .with_syntax_example("coalesce(expression1[, ..., expression_n])") + .with_argument( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." + ) + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { self @@ -140,6 +158,10 @@ impl ScalarUDFImpl for CoalesceFunc { .unwrap_or(arg_types.first().unwrap().clone()); Ok(vec![new_type; arg_types.len()]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_coalesce_doc()) + } } #[cfg(test)] diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 2795c4a25004..df3045f22cf5 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -19,13 +19,18 @@ use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_HASHING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct SHA224Func { signature: Signature, } + impl Default for SHA224Func { fn default() -> Self { Self::new() @@ -44,6 +49,22 @@ impl SHA224Func { } } } + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_sha224_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_HASHING) + .with_description("Computes the SHA-224 hash of a binary string.") + .with_syntax_example("sha224(expression)") + .with_argument("expression", + "String expression to 
operate on. Can be a constant, column, or function, and any combination of string operators.") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for SHA224Func { fn as_any(&self) -> &dyn Any { self @@ -60,7 +81,12 @@ impl ScalarUDFImpl for SHA224Func { fn return_type(&self, arg_types: &[DataType]) -> Result { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } + fn invoke(&self, args: &[ColumnarValue]) -> Result { sha224(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_sha224_doc()) + } } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 288641b84dd7..176d7f8bbcbf 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -15,17 +15,19 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - +use crate::datetime::common::*; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Date32; use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; - -use crate::datetime::common::*; use datafusion_common::error::DataFusionError; use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct ToDateFunc { @@ -77,6 +79,53 @@ impl ToDateFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_to_date_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_DATETIME) + .with_description(r#"Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. 
+Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. +"#) + .with_syntax_example("to_date('2017-05-31', '%Y-%m-%d')") + .with_sql_example(r#"```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +"#) + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .with_argument( + "format_n", + "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. 
If none of the formats successfully parse the expression + an error will be returned.", + ) + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for ToDateFunc { fn as_any(&self) -> &dyn Any { self @@ -117,6 +166,10 @@ impl ScalarUDFImpl for ToDateFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_to_date_doc()) + } } #[cfg(test)] diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 2a22e572614b..4f91879f94db 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -31,10 +31,11 @@ use datafusion_common::{ }; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::ColumnarValue; -use std::sync::Arc; +use datafusion_expr::{ColumnarValue, Documentation}; +use std::sync::{Arc, OnceLock}; use std::{fmt, str::FromStr}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; @@ -57,6 +58,22 @@ impl EncodeFunc { } } +static ENCODE_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_encode_doc() -> &'static Documentation { + ENCODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Encode binary data into a textual representation.") + .with_syntax_example("encode(expression, format)") + .with_argument("expression", "Expression containing string or binary data") + .with_argument("format", "Supported formats are: `base64`, `hex`") + .with_related_udf("decode") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for EncodeFunc { fn as_any(&self) -> &dyn Any { self @@ -103,6 +120,10 @@ impl ScalarUDFImpl for EncodeFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_encode_doc()) + } } #[derive(Debug)] @@ -124,6 +145,22 @@ impl DecodeFunc { } } +static DECODE_DOCUMENTATION: OnceLock = 
OnceLock::new(); + +fn get_decode_doc() -> &'static Documentation { + DECODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Decode binary data from textual representation in string.") + .with_syntax_example("decode(expression, format)") + .with_argument("expression", "Expression containing encoded string data") + .with_argument("format", "Same arguments as [encode](#encode)") + .with_related_udf("encode") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for DecodeFunc { fn as_any(&self) -> &dyn Any { self @@ -170,6 +207,10 @@ impl ScalarUDFImpl for DecodeFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_decode_doc()) + } } #[derive(Debug, Copy, Clone)] diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index ad7cff1f7149..889e3761d26c 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -18,7 +18,7 @@ //! Math function: `log()`. 
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use super::power::PowerFunc; @@ -29,9 +29,12 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; -use datafusion_expr::{lit, ColumnarValue, Expr, ScalarUDF, TypeSignature::*}; +use datafusion_expr::{ + lit, ColumnarValue, Documentation, Expr, ScalarUDF, TypeSignature::*, +}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -45,6 +48,24 @@ impl Default for LogFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_log_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MATH) + .with_description("Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.") + .with_syntax_example(r#"log(base, numeric_expression) +log(numeric_expression)"#) + .with_argument("base", + "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("numeric_expression", + "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) +} + impl LogFunc { pub fn new() -> Self { use DataType::*; @@ -164,6 +185,10 @@ impl ScalarUDFImpl for LogFunc { Ok(ColumnarValue::Array(arr)) } + fn documentation(&self) -> Option<&Documentation> { + Some(get_log_doc()) + } + /// Simplify the `log` function by the relevant rules: /// 1. Log(a, 1) ===> 0 /// 2. 
Log(a, Power(a, b)) ===> b diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 8cd26a824acc..2abb4a9376c5 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -25,22 +25,64 @@ use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX; use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct RegexpLikeFunc { signature: Signature, } + impl Default for RegexpLikeFunc { fn default() -> Self { Self::new() } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_regexp_like_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_REGEX) + .with_description("Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.") + .with_syntax_example("regexp_like(str, regexp[, flags])") + .with_sql_example(r#"```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` +Additional examples can be found 
[here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) +"#) + .with_argument("str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.") + .with_argument("regexp", + "Regular expression to test against the string expression. Can be a constant, column, or function.") + .with_argument("flags", + r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*?"#) + .build() + .unwrap() + }) +} + impl RegexpLikeFunc { pub fn new() -> Self { use DataType::*; @@ -105,6 +147,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { result.map(ColumnarValue::Array) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_regexp_like_doc()) + } } fn regexp_like_func(args: &[ArrayRef]) -> Result { match args[0].data_type() { diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 68ba3f5ff15f..d01c6631e9dd 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -20,10 +20,29 @@ use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array}; use arrow::datatypes::DataType; use arrow::error::ArrowError; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ascii_doc() -> &'static Documentation { + 
DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the ASCII value of the first character in a string.") + .with_syntax_example("ascii(str)") + .with_argument( + "str", + "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View.", + ) + .with_related_udf("chr") + .build() + .unwrap() + }) +} #[derive(Debug)] pub struct AsciiFunc { @@ -71,6 +90,10 @@ impl ScalarUDFImpl for AsciiFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(ascii, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ascii_doc()) + } } fn calculate_ascii<'a, V>(array: V) -> Result diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index c1d6f327928f..ce221b44f42b 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -25,11 +25,14 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; use std::fmt::Write; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; @@ -44,6 +47,27 @@ impl Default for RPadFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the right side of a string with another string to a specified 
string length.") + .with_syntax_example("rpad(str, n[, padding_str])") + .with_argument( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.", + ) + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("lpad") + .build() + .unwrap() + }) +} + impl RPadFunc { pub fn new() -> Self { use DataType::*; @@ -113,6 +137,10 @@ impl ScalarUDFImpl for RPadFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_rpad_doc()) + } } pub fn rpad( diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh index 836ba6772eac..585cb77839f9 100755 --- a/dev/update_config_docs.sh +++ b/dev/update_config_docs.sh @@ -24,7 +24,7 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SOURCE_DIR}/../" && pwd TARGET_FILE="docs/source/user-guide/configs.md" -PRINT_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" +PRINT_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" echo "Inserting header" cat <<'EOF' > "$TARGET_FILE" @@ -67,8 +67,8 @@ Environment variables are read during `SessionConfig` initialisation so they mus EOF -echo "Running CLI and inserting docs table" -$PRINT_DOCS_COMMAND >> "$TARGET_FILE" +echo "Running CLI and inserting config docs table" +$PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE" echo "Running prettier" npx prettier@2.3.2 --write "$TARGET_FILE" diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh new file mode 100755 index 000000000000..a4236eefc8c8 --- /dev/null +++ b/dev/update_function_docs.sh @@ -0,0 +1,284 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + + +TARGET_FILE="docs/source/user-guide/sql/aggregate_functions_new.md" +PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- aggregate" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. +EOF + +echo "Running CLI and inserting aggregate function docs table" +$PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + +TARGET_FILE="docs/source/user-guide/sql/scalar_functions_new.md" +PRINT_SCALAR_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- scalar" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. 
+EOF + +echo "Running CLI and inserting scalar function docs table" +$PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + +TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" +PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- window" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + + +# Window Functions (NEW) + +This page is a WIP and will replace the Window Functions page once completed. + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and 
argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. 
Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. 
For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is an non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +EOF + +echo "Running CLI and inserting window function docs table" +$PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index c8f0ffbec701..ababb001f5c5 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -69,7 +69,7 @@ value ::: :::{note} -Since `&&` and `||` are existed as logical operators in Rust, but those are not overloadable and not works with expression API. +Since `&&` and `||` are logical operators in Rust and cannot be overloaded, they are not available in the expression API.
::: ## Bitwise Expressions @@ -151,7 +151,7 @@ but these operators always return a `bool` which makes them not work with the ex | trunc(x) | truncate toward zero | :::{note} -Unlike to some databases the math functions in Datafusion works the same way as Rust math functions, avoiding failing on corner cases e.g +Unlike some databases, the math functions in DataFusion work the same way as Rust math functions, avoiding failures on corner cases, e.g. ```sql select log(-1), log(0), sqrt(-1); diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md new file mode 100644 index 000000000000..8303c50c2471 --- /dev/null +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -0,0 +1,74 @@ + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. + +## General Functions + +- [bit_and](#bit_and) +- [bit_or](#bit_or) +- [bit_xor](#bit_xor) + +### `bit_and` + +Computes the bitwise AND of all non-null input values. + +``` +bit_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +### `bit_or` + +Computes the bitwise OR of all non-null input values. + +``` +bit_or(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +### `bit_xor` + +Computes the bitwise exclusive OR of all non-null input values. + +``` +bit_xor(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index 04d1fc228f81..6eb451c83b96 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -30,7 +30,10 @@ SQL Reference information_schema operators aggregate_functions + aggregate_functions_new window_functions + window_functions_new scalar_functions + scalar_functions_new sql_status write_options diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md new file mode 100644 index 000000000000..ae2744c1650e --- /dev/null +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -0,0 +1,249 @@ + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. + +## Math Functions + +- [log](#log) + +### `log` + +Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number. + +``` +log(base, numeric_expression) +log(numeric_expression) +``` + +#### Arguments + +- **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +## Conditional Functions + +- [coalesce](#coalesce) + +### `coalesce` + +Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values. + +``` +coalesce(expression1[, ..., expression_n]) +``` + +#### Arguments + +- **expression1, expression_n**: Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary. 
+ +## String Functions + +- [ascii](#ascii) +- [rpad](#rpad) + +### `ascii` + +Returns the ASCII value of the first character in a string. + +``` +ascii(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View. + +**Related functions**: + +- [chr](#chr) + +### `rpad` + +Pads the right side of a string with another string to a specified string length. + +``` +rpad(str, n[, padding_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **n**: String length to pad to. +- **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ + +**Related functions**: + +- [lpad](#lpad) + +## Binary String Functions + +- [decode](#decode) +- [encode](#encode) + +### `decode` + +Decode binary data from textual representation in string. + +``` +decode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing encoded string data +- **format**: Same arguments as [encode](#encode) + +**Related functions**: + +- [encode](#encode) + +### `encode` + +Encode binary data into a textual representation. + +``` +encode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing string or binary data +- **format**: Supported formats are: `base64`, `hex` + +**Related functions**: + +- [decode](#decode) + +## Regular Expression Functions + +Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). 
+The following regular expression functions are supported: + +- [regexp_like](#regexp_like) + +### `regexp_like` + +Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise. + +``` +regexp_like(str, regexp[, flags]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **regexp**: Regular expression to test against the string expression. Can be a constant, column, or function. +- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*? + +#### Example + +```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) + +## Time and Date Functions + +- [to_date](#to_date) + +### `to_date` + +Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. 
'2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. + +``` +to_date('2017-05-31', '%Y-%m-%d') +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. + +#### Example + +```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) + +## Hashing Functions + +- [sha224](#sha224) + +### `sha224` + +Computes the SHA-224 hash of a binary string. + +``` +sha224(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. 
diff --git a/docs/source/user-guide/sql/window_functions_new.md b/docs/source/user-guide/sql/window_functions_new.md new file mode 100644 index 000000000000..1ab6740a6f87 --- /dev/null +++ b/docs/source/user-guide/sql/window_functions_new.md @@ -0,0 +1,161 @@ + + + + +# Window Functions (NEW) + +This page is a WIP and will replace the Window Functions page once completed. + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. 
The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. 
Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. 
For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is an non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +## Ranking Functions + +- [row_number](#row_number) + +### `row_number` + +Number of the current row within its partition, counting from 1. + +``` +row_number() +```