apache · alamb · May 13, 2024 · May 7, 2024 · May 7, 2024 · May 8, 2024
diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs
@@ -259,52 +259,69 @@ mod tests {
 
-        // shoule be valid
+        // should be valid
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',');"
+                     .as_bytes(),
+             ),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\0';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\0');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\n';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\n');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\r';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\r');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\t';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\t');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\\';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\\');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Valid(None)));
 
-        // should be invalid
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',,';".as_bytes()),
-            &validator,
-        )?;
-        assert!(matches!(result, ValidationResult::Invalid(Some(_))));
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',,');"
+                     .as_bytes()),
+             &validator,
+         )?;
+        assert!(matches!(result, ValidationResult::Valid(None)));
 
+        // should be invalid
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\u{07}';".as_bytes()),
-            &validator,
-        )?;
+             Cursor::new(
+                 r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\u{07}');"
+                     .as_bytes()),
+             &validator,
+         )?;
         assert!(matches!(result, ValidationResult::Invalid(Some(_))));
 
         Ok(())

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -1564,7 +1564,7 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
-        pub has_header: bool, default = true
+        pub has_header: Option<bool>, default = None
         pub delimiter: u8, default = b','
         pub quote: u8, default = b'"'
         pub escape: Option<u8>, default = None
@@ -1600,13 +1600,13 @@ impl CsvOptions {
     /// Set true to indicate that the first line is a header.
-    /// - default to true
+    /// - if never set, the option stays `None` and `has_header()` returns `false`
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.has_header = has_header;
+        self.has_header = Some(has_header);
         self
     }
 
     /// True if the first line is a header.
     pub fn has_header(&self) -> bool {
-        self.has_header
+        self.has_header.unwrap_or(false)
     }
 
     /// The character separating values within a row.

diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs
@@ -50,7 +50,8 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
 
     fn try_from(value: &CsvOptions) -> Result<Self> {
+        // An unset `has_header` keeps the previous default (`true`): still write a header row.
         let mut builder = WriterBuilder::default()
-            .with_header(value.has_header)
+            .with_header(value.has_header.unwrap_or(true))
             .with_delimiter(value.delimiter);
 
         if let Some(v) = &value.date_format {

diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs
@@ -27,7 +27,6 @@ use crate::datasource::provider::TableProviderFactory;
 use crate::datasource::TableProvider;
 use crate::execution::context::SessionState;
 
-use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::{Constraints, DFSchema, DataFusionError, TableReference};
 use datafusion_expr::CreateExternalTable;
 
@@ -140,11 +139,9 @@ impl ListingSchemaProvider {
                             location: table_url,
                             file_type: self.format.clone(),
                             has_header: self.has_header,
-                            delimiter: ',',
                             table_partition_cols: vec![],
                             if_not_exists: false,
                             definition: None,
-                            file_compression_type: CompressionTypeVariant::UNCOMPRESSED,
                             order_exprs: vec![],
                             unbounded: false,
                             options: Default::default(),

diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
@@ -136,13 +136,13 @@ impl CsvFormat {
     /// Set true to indicate that the first line is a header.
-    /// - default to true
+    /// - if unset, schema inference and scans use the session `catalog.has_header`
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.options.has_header = has_header;
+        self.options.has_header = Some(has_header);
         self
     }
 
     /// True if the first line is a header.
     pub fn has_header(&self) -> bool {
-        self.options.has_header
+        self.options.has_header.unwrap_or(false)
     }
 
     /// The character separating values within a row.
@@ -200,7 +200,7 @@ impl FileFormat for CsvFormat {
 
     async fn infer_schema(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         store: &Arc<dyn ObjectStore>,
         objects: &[ObjectMeta],
     ) -> Result<SchemaRef> {
@@ -211,7 +211,7 @@ impl FileFormat for CsvFormat {
         for object in objects {
             let stream = self.read_to_delimited_chunks(store, object).await;
             let (schema, records_read) = self
-                .infer_schema_from_stream(records_to_read, stream)
+                .infer_schema_from_stream(state, records_to_read, stream)
                 .await?;
             records_to_read -= records_read;
             schemas.push(schema);
@@ -236,13 +236,15 @@ impl FileFormat for CsvFormat {
 
     async fn create_physical_plan(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         conf: FileScanConfig,
         _filters: Option<&Arc<dyn PhysicalExpr>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let exec = CsvExec::new(
             conf,
-            self.options.has_header,
+            self.options
+                .has_header
+                .unwrap_or(state.config_options().catalog.has_header),
             self.options.delimiter,
             self.options.quote,
             self.options.escape,
@@ -286,6 +288,7 @@ impl CsvFormat {
     /// number of lines that were read
     async fn infer_schema_from_stream(
         &self,
+        state: &SessionState,
         mut records_to_read: usize,
         stream: impl Stream<Item = Result<Bytes>>,
     ) -> Result<(Schema, usize)> {
@@ -298,7 +301,12 @@ impl CsvFormat {
 
         while let Some(chunk) = stream.next().await.transpose()? {
             let format = arrow::csv::reader::Format::default()
-                .with_header(self.options.has_header && first_chunk)
+                .with_header(
+                    self.options
+                        .has_header
+                        .unwrap_or(state.config_options().catalog.has_header)
+                        && first_chunk,
+                )
                 .with_delimiter(self.options.delimiter);
 
             let (Schema { fields, .. }, records_read) =
@@ -538,6 +546,7 @@ mod tests {
     use datafusion_common::cast::as_string_array;
     use datafusion_common::stats::Precision;
     use datafusion_common::{internal_err, GetExt};
+    use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv};
     use datafusion_expr::{col, lit};
 
     use chrono::DateTime;
@@ -554,7 +563,8 @@ mod tests {
         let task_ctx = state.task_ctx();
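-        // skip column 9 that overflows the automaticly discovered column type of i64 (u64 would work)
+        // skip column 9 that overflows the automatically discovered column type of i64 (u64 would work)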
         let projection = Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]);
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
         let stream = exec.execute(0, task_ctx)?;
 
         let tt_batches: i32 = stream
@@ -582,7 +592,7 @@ mod tests {
         let task_ctx = session_ctx.task_ctx();
         let projection = Some(vec![0, 1, 2, 3]);
         let exec =
-            get_exec(&state, "aggregate_test_100.csv", projection, Some(1)).await?;
+            get_exec(&state, "aggregate_test_100.csv", projection, Some(1), true).await?;
         let batches = collect(exec, task_ctx).await?;
         assert_eq!(1, batches.len());
         assert_eq!(4, batches[0].num_columns());
@@ -597,7 +607,8 @@ mod tests {
         let state = session_ctx.state();
 
         let projection = None;
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
 
         let x: Vec<String> = exec
             .schema()
@@ -633,7 +644,8 @@ mod tests {
         let state = session_ctx.state();
         let task_ctx = session_ctx.task_ctx();
         let projection = Some(vec![0]);
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
 
         let batches = collect(exec, task_ctx).await.expect("Collect batches");
 
@@ -716,6 +728,10 @@ mod tests {
     async fn query_compress_data(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap());
+        let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true");
+        let session_state = SessionState::new_with_config_rt(cfg, runtime);
+
         let integration = LocalFileSystem::new_with_prefix(arrow_test_data()).unwrap();
 
         let path = Path::from("csv/aggregate_test_100.csv");
@@ -757,7 +773,7 @@ mod tests {
             .read_to_delimited_chunks_from_stream(compressed_stream.unwrap())
             .await;
         let (schema, records_read) = compressed_csv
-            .infer_schema_from_stream(records_to_read, decoded_stream)
+            .infer_schema_from_stream(&session_state, records_to_read, decoded_stream)
             .await?;
 
         assert_eq!(expected, schema);
@@ -803,9 +819,10 @@ mod tests {
         file_name: &str,
         projection: Option<Vec<usize>>,
         limit: Option<usize>,
+        has_header: bool,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let root = format!("{}/csv", crate::test_util::arrow_test_data());
-        let format = CsvFormat::default();
+        let format = CsvFormat::default().with_has_header(has_header);
         scan_format(state, &format, &root, file_name, projection, limit).await
     }
 

diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -34,7 +34,7 @@ use crate::datasource::TableProvider;
 use crate::execution::context::SessionState;
 
 use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::{arrow_datafusion_err, DataFusionError, FileType};
+use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, FileType};
 use datafusion_expr::CreateExternalTable;
 
 use async_trait::async_trait;
@@ -67,9 +67,13 @@ impl TableProviderFactory for ListingTableFactory {
         let file_format: Arc<dyn FileFormat> = match file_type {
             FileType::CSV => {
                 let mut csv_options = table_options.csv;
-                csv_options.has_header = cmd.has_header;
-                csv_options.delimiter = cmd.delimiter as u8;
-                csv_options.compression = cmd.file_compression_type;
+                match csv_options.has_header {
+                    Some(false) if cmd.has_header => {
+                        return plan_err!("Conflicting header options for CSV file");
+                    }
+                    Some(_) => {} // keep the explicitly configured value
+                    None => csv_options.has_header = Some(cmd.has_header),
+                }
                 Arc::new(CsvFormat::default().with_options(csv_options))
             }
             #[cfg(feature = "parquet")]
@@ -78,9 +82,7 @@ impl TableProviderFactory for ListingTableFactory {
             }
             FileType::AVRO => Arc::new(AvroFormat),
             FileType::JSON => {
-                let mut json_options = table_options.json;
-                json_options.compression = cmd.file_compression_type;
-                Arc::new(JsonFormat::default().with_options(json_options))
+                Arc::new(JsonFormat::default().with_options(table_options.json))
             }
             FileType::ARROW => Arc::new(ArrowFormat),
         };
@@ -172,7 +174,6 @@ mod tests {
     use super::*;
     use crate::execution::context::SessionContext;
 
-    use datafusion_common::parsers::CompressionTypeVariant;
     use datafusion_common::{Constraints, DFSchema, TableReference};
 
     #[tokio::test]
@@ -192,11 +193,9 @@ mod tests {
             location: csv_file.path().to_str().unwrap().to_string(),
             file_type: "csv".to_string(),
             has_header: true,
-            delimiter: ',',
             schema: Arc::new(DFSchema::empty()),
             table_partition_cols: vec![],
             if_not_exists: false,
-            file_compression_type: CompressionTypeVariant::UNCOMPRESSED,
             definition: None,
             order_exprs: vec![],
             unbounded: false,
@@ -233,11 +232,9 @@ mod tests {
             location: csv_file.path().to_str().unwrap().to_string(),
             file_type: "csv".to_string(),
             has_header: true,
-            delimiter: ',',
             schema: Arc::new(DFSchema::empty()),
             table_partition_cols: vec![],
             if_not_exists: false,
-            file_compression_type: CompressionTypeVariant::UNCOMPRESSED,
             definition: None,
             order_exprs: vec![],
             unbounded: false,