Consolidate sql examples #11173

Closed. Wants to merge 10 commits. Changes from 8 commits.
8 changes: 1 addition & 7 deletions datafusion-examples/README.md
@@ -47,7 +47,6 @@ cargo run --example csv_sql
- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)
- [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files
- [`avro_sql.rs`](examples/avro_sql.rs): Build and run a query plan from a SQL statement against a local AVRO file
- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
- [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization
- [`csv_sql.rs`](examples/csv_sql.rs): Build and run a query plan from a SQL statement against a local CSV file
@@ -62,27 +61,22 @@ cargo run --example csv_sql
- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks.
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
- [`memtable.rs`](examples/memtable.rs): Create and query data in memory using SQL and `RecordBatch`es
- [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates
- [`parquet_index.rs`](examples/parquet_index.rs): Create a secondary index over several parquet files and use it to speed up queries
- [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution
- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`.
- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan`
- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files via HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
- [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom Query Optimizer pass
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
- [`simple_udwf.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
- [`sql.rs`](examples/sql.rs): Run SQL queries with functions against files, and in memory data
- [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures
- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser`
- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions

## Distributed

51 changes: 0 additions & 51 deletions datafusion-examples/examples/avro_sql.rs

This file was deleted.

156 changes: 154 additions & 2 deletions datafusion-examples/examples/dataframe.rs
@@ -15,17 +15,37 @@
// specific language governing permissions and limitations
// under the License.

//! This file contains several examples of how to run queries using DataFusion's
//! DataFrame API:
//!
//! * [`parquet`]: query a single Parquet file
//! * [`to_date_demo`]: use the `to_date` function to convert strings to dates
//! * [`to_timestamp_demo`]: use the `to_timestamp` function to convert strings to timestamps
//! * [`make_date_demo`]: use the `make_date` function to create dates from year, month, and day
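//!
//! These demos can typically be run from the `datafusion-examples` directory
//! with `cargo run --example dataframe`.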

use arrow::array::{Int32Array, RecordBatch, StringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::prelude::*;
use std::fs::File;
use std::io::Write;
use std::sync::Arc;
use tempfile::tempdir;

/// This example demonstrates executing a simple query against an Arrow data source (Parquet) and
/// fetching results, using the DataFrame trait
#[tokio::main]
async fn main() -> Result<()> {
parquet().await?;
to_date_demo().await?;
to_timestamp_demo().await?;
make_date_demo().await?;

Ok(())
}

/// This example demonstrates executing a simple query against an Arrow data
/// source (Parquet) and fetching results, using the DataFrame trait
async fn parquet() -> Result<()> {
// create local execution context
let ctx = SessionContext::new();

@@ -109,3 +129,135 @@ async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame {
// Register a lazy DataFrame by using the context and option provider
ctx.read_csv(file_path, csv_read_option).await.unwrap()
}

Comment from the PR author:
I think putting the related functionality together in one example will make it easier for people to find what they are looking for (especially as the number of examples in DataFusion continues to grow)

/// This example demonstrates how to use the to_date series
/// of functions in the DataFrame API
Comment from the PR author:
Some of the consolidated examples had both SQL and dataframe examples, so I split such examples into dataframe.rs and sql.rs
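
For reference, a minimal sketch of what the sql.rs counterpart of these demos might look like (illustrative only, assuming the same in-memory table `t` that is registered below; this is not the exact code from this PR):

// Illustrative sketch, not the exact code from this PR: run the same
// string-to-date conversion through SQL instead of the DataFrame API.
let ctx = SessionContext::new();
ctx.register_batch("t", batch)?;
let df = ctx.sql("SELECT to_date(a) AS a FROM t").await?;
df.show().await?;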

async fn to_date_demo() -> Result<()> {
// define a schema.
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));

// define data.
let batch = RecordBatch::try_new(
schema,
vec![Arc::new(StringArray::from(vec![
"2020-09-08T13:42:29Z",
"2020-09-08T13:42:29.190855-05:00",
"2020-08-09 12:13:29",
"2020-01-02",
]))],
)?;

// declare a new context. In the Spark API, this corresponds to a new Spark SQL session
let ctx = SessionContext::new();

// declare a table in memory. In the Spark API, this corresponds to createDataFrame(...).
ctx.register_batch("t", batch)?;
let df = ctx.table("t").await?;

// use the to_date function to convert col 'a' to date type using the default parsing
let df = df.with_column("a", to_date(vec![col("a")]))?;

let df = df.select_columns(&["a"])?;

// print the results
df.show().await?;

Ok(())
}

/// This example demonstrates how to use the to_timestamp series
/// of functions in the DataFrame API
async fn to_timestamp_demo() -> Result<()> {
// define a schema.
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Utf8, false),
Field::new("b", DataType::Utf8, false),
]));

// define data.
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(StringArray::from(vec![
"2020-09-08T13:42:29Z",
"2020-09-08T13:42:29.190855-05:00",
"2020-08-09 12:13:29",
"2020-01-02",
])),
Arc::new(StringArray::from(vec![
"2020-09-08T13:42:29Z",
"2020-09-08T13:42:29.190855-05:00",
"08-09-2020 13/42/29",
"09-27-2020 13:42:29-05:30",
])),
],
)?;

// declare a new context. In the Spark API, this corresponds to a new Spark SQL session
let ctx = SessionContext::new();

// declare a table in memory. In the Spark API, this corresponds to createDataFrame(...).
ctx.register_batch("t", batch)?;
let df = ctx.table("t").await?;

// use to_timestamp function to convert col 'a' to timestamp type using the default parsing
let df = df.with_column("a", to_timestamp(vec![col("a")]))?;
// use to_timestamp_seconds function to convert col 'b' to timestamp(Seconds) type using a list
// of chrono formats (https://docs.rs/chrono/latest/chrono/format/strftime/index.html) to try
let df = df.with_column(
"b",
to_timestamp_seconds(vec![
col("b"),
lit("%+"),
lit("%d-%m-%Y %H/%M/%S"),
lit("%m-%d-%Y %H:%M:%S%#z"),
]),
)?;

let df = df.select_columns(&["a", "b"])?;

// print the results
df.show().await?;

Ok(())
}

/// This example demonstrates how to use the make_date
/// function in the DataFrame API.
async fn make_date_demo() -> Result<()> {
// define a schema.
let schema = Arc::new(Schema::new(vec![
Field::new("y", DataType::Int32, false),
Field::new("m", DataType::Int32, false),
Field::new("d", DataType::Int32, false),
]));

// define data.
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(vec![2020, 2021, 2022, 2023, 2024])),
Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
Arc::new(Int32Array::from(vec![15, 16, 17, 18, 19])),
],
)?;

// declare a new context. In the Spark API, this corresponds to a new Spark SQL session
let ctx = SessionContext::new();

// declare a table in memory. In the Spark API, this corresponds to createDataFrame(...).
ctx.register_batch("t", batch)?;
let df = ctx.table("t").await?;

// use make_date function to convert col 'y', 'm' & 'd' to a date
let df = df.with_column("a", make_date(col("y"), col("m"), col("d")))?;
// use make_date function to convert col 'y' & 'm' with a static day to a date
let df = df.with_column("b", make_date(col("y"), col("m"), lit(22)))?;

let df = df.select_columns(&["a", "b"])?;

// print the results
df.show().await?;

Ok(())
}
120 changes: 0 additions & 120 deletions datafusion-examples/examples/make_date.rs

This file was deleted.
