From 618c775f64b30c4acb4991562373cdd91b6ad365 Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Sun, 29 Sep 2024 12:20:37 +0530 Subject: [PATCH 1/2] chore: add docs, part of #37 - add pragma `#![warn(missing_docs)]` to the following - `arrow-flight` - `arrow-ipc` - `arrow-integration-test` - `arrow-integration-testing` - `object_store` - also document the caveat with using level 10 GZIP compression in parquet. See #6282. --- arrow-flight/examples/flight_sql_server.rs | 3 +- arrow-flight/src/bin/flight_sql_client.rs | 3 +- arrow-flight/src/decode.rs | 7 ++- arrow-flight/src/encode.rs | 3 +- arrow-flight/src/error.rs | 2 + arrow-flight/src/lib.rs | 5 ++ arrow-flight/src/sql/client.rs | 4 +- arrow-flight/src/sql/metadata/sql_info.rs | 13 ++---- arrow-flight/src/sql/metadata/xdbc_info.rs | 39 ++++++++++++---- arrow-flight/src/sql/mod.rs | 11 ++++- arrow-flight/src/utils.rs | 7 ++- arrow-integration-test/src/lib.rs | 34 ++++++++++++++ .../auth_basic_proto.rs | 3 ++ .../integration_test.rs | 3 ++ .../src/flight_client_scenarios/middleware.rs | 3 ++ .../mod.rs} | 2 + .../auth_basic_proto.rs | 4 ++ .../integration_test.rs | 9 +++- .../src/flight_server_scenarios/middleware.rs | 4 ++ .../mod.rs} | 3 ++ arrow-integration-testing/src/lib.rs | 18 ++++++-- arrow-ipc/src/convert.rs | 3 +- arrow-ipc/src/lib.rs | 2 + arrow-ipc/src/writer.rs | 11 ++++- arrow-json/src/writer.rs | 9 ++-- arrow-schema/src/field.rs | 4 +- arrow/tests/array_cast.rs | 8 ++-- object_store/src/aws/builder.rs | 1 - object_store/src/aws/client.rs | 1 - object_store/src/aws/resolve.rs | 1 - object_store/src/azure/builder.rs | 1 - object_store/src/azure/client.rs | 1 - object_store/src/client/get.rs | 1 - object_store/src/lib.rs | 46 +++++++++++++++++-- object_store/src/local.rs | 1 - object_store/src/memory.rs | 1 - object_store/src/path/mod.rs | 35 ++++++++++++-- parquet/src/compression.rs | 24 ++++++++++ 38 files changed, 265 insertions(+), 65 deletions(-) rename arrow-integration-testing/src/{flight_client_scenarios.rs => flight_client_scenarios/mod.rs} (93%) rename arrow-integration-testing/src/{flight_server_scenarios.rs => flight_server_scenarios/mod.rs} (91%) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 81afecf85625..dd3a3943dd95 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -19,6 +19,7 @@ use arrow_flight::sql::server::PeekableFlightDataStream; use arrow_flight::sql::DoPutPreparedStatementResult; use base64::prelude::BASE64_STANDARD; use base64::Engine; +use core::str; use futures::{stream, Stream, TryStreamExt}; use once_cell::sync::Lazy; use prost::Message; @@ -168,7 +169,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let bytes = BASE64_STANDARD .decode(base64) .map_err(|e| status!("authorization not decodable", e))?; - let str = String::from_utf8(bytes).map_err(|e| status!("authorization not parsable", e))?; + let str = str::from_utf8(&bytes).map_err(|e| status!("authorization not parsable", e))?; let parts: Vec<_> = str.split(':').collect(); let (user, pass) = match parts.as_slice() { [user, pass] => (user, pass), diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index c334b95a9a96..1fbf81f3b524 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -26,6 +26,7 @@ use arrow_flight::{ }; use arrow_schema::Schema; use clap::{Parser, Subcommand}; +use core::str; use futures::TryStreamExt; use tonic::{ metadata::MetadataMap, @@ -421,7 +422,7 @@ fn log_metadata(map: &MetadataMap, what: &'static str) { "{}: {}={}", what, k.as_str(), - String::from_utf8_lossy(v.as_ref()), + str::from_utf8(v.as_ref()).unwrap(), ); } } diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index 5561f256ce01..7bafc384306b 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -388,11 +388,14 @@ struct FlightStreamState { /// FlightData and the decoded payload (Schema, RecordBatch), if any #[derive(Debug)] pub struct DecodedFlightData { + /// The original FlightData message pub inner: FlightData, + /// The decoded payload pub payload: DecodedPayload, } impl DecodedFlightData { + /// Create a new DecodedFlightData with no payload pub fn new_none(inner: FlightData) -> Self { Self { inner, @@ -400,6 +403,7 @@ impl DecodedFlightData { } } + /// Create a new DecodedFlightData with a [`Schema`] payload pub fn new_schema(inner: FlightData, schema: SchemaRef) -> Self { Self { inner, @@ -407,6 +411,7 @@ impl DecodedFlightData { } } + /// Create a new [`DecodedFlightData`] with a [`RecordBatch`] payload pub fn new_record_batch(inner: FlightData, batch: RecordBatch) -> Self { Self { inner, @@ -414,7 +419,7 @@ impl DecodedFlightData { } } - /// return the metadata field of the inner flight data + /// Return the metadata field of the inner flight data pub fn app_metadata(&self) -> Bytes { self.inner.app_metadata.clone() } diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 59fa8afd58d5..55bc9240321d 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -144,6 +144,7 @@ impl Default for FlightDataEncoderBuilder { } impl FlightDataEncoderBuilder { + /// Create a new [`FlightDataEncoderBuilder`]. pub fn new() -> Self { Self::default() } @@ -1403,7 +1404,7 @@ mod tests { let input_rows = batch.num_rows(); let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); - let sizes: Vec<_> = split.iter().map(|batch| batch.num_rows()).collect(); + let sizes: Vec<_> = split.iter().map(RecordBatch::num_rows).collect(); let output_rows: usize = sizes.iter().sum(); assert_eq!(sizes, expected_sizes, "mismatch for {batch:?}"); diff --git a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs index ba979ca9f7a6..499706e1ede7 100644 --- a/arrow-flight/src/error.rs +++ b/arrow-flight/src/error.rs @@ -37,6 +37,7 @@ pub enum FlightError { } impl FlightError { + /// Generate a new `FlightError::ProtocolError` variant. pub fn protocol(message: impl Into) -> Self { Self::ProtocolError(message.into()) } @@ -98,6 +99,7 @@ impl From for tonic::Status { } } +/// Result type for the Apache Arrow Flight crate pub type Result = std::result::Result; #[cfg(test)] diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 64e3ba01c5bd..9f18416c06ec 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -37,6 +37,7 @@ //! //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] +#![warn(missing_docs)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; @@ -52,6 +53,8 @@ type ArrowResult = std::result::Result; #[allow(clippy::all)] mod gen { + // Since this file is auto-generated, we suppress all warnings + #![allow(missing_docs)] include!("arrow.flight.protocol.rs"); } @@ -125,6 +128,7 @@ use flight_descriptor::DescriptorType; /// SchemaAsIpc represents a pairing of a `Schema` with IpcWriteOptions pub struct SchemaAsIpc<'a> { + /// Data type representing a schema and its IPC write options pub pair: (&'a Schema, &'a IpcWriteOptions), } @@ -684,6 +688,7 @@ impl PollInfo { } impl<'a> SchemaAsIpc<'a> { + /// Create a new `SchemaAsIpc` from a `Schema` and `IpcWriteOptions` pub fn new(schema: &'a Schema, options: &'a IpcWriteOptions) -> Self { SchemaAsIpc { pair: (schema, options), diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index ef52aa27ef50..e45e505b2b61 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -695,9 +695,11 @@ fn flight_error_to_arrow_error(err: FlightError) -> ArrowError { } } -// A polymorphic structure to natively represent different types of data contained in `FlightData` +/// A polymorphic structure to natively represent different types of data contained in `FlightData` pub enum ArrowFlightData { + /// A record batch RecordBatch(RecordBatch), + /// A schema Schema(Schema), } diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 97304d3c872d..2ea30df7fc2f 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -331,7 +331,7 @@ impl SqlInfoUnionBuilder { /// /// Servers constuct - usually static - [`SqlInfoData`] via the [`SqlInfoDataBuilder`], /// and build responses using [`CommandGetSqlInfo::into_builder`] -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Default)] pub struct SqlInfoDataBuilder { /// Use BTreeMap to ensure the values are sorted by value as /// to make output consistent @@ -341,17 +341,10 @@ pub struct SqlInfoDataBuilder { infos: BTreeMap, } -impl Default for SqlInfoDataBuilder { - fn default() -> Self { - Self::new() - } -} - impl SqlInfoDataBuilder { + /// Create a new SQL info builder pub fn new() -> Self { - Self { - infos: BTreeMap::new(), - } + Self::default() } /// register the specific sql metadata item diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index 2e635d3037bc..485bedaebfb0 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -41,24 +41,43 @@ use crate::sql::{CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, Xdb /// Data structure representing type information for xdbc types. #[derive(Debug, Clone, Default)] pub struct XdbcTypeInfo { + /// The name of the type pub type_name: String, + /// The data type of the type pub data_type: XdbcDataType, + /// The column size of the type pub column_size: Option, + /// The prefix of the type pub literal_prefix: Option, + /// The suffix of the type pub literal_suffix: Option, + /// The create parameters of the type pub create_params: Option>, + /// The nullability of the type pub nullable: Nullable, + /// Whether the type is case sensitive pub case_sensitive: bool, + /// Whether the type is searchable pub searchable: Searchable, + /// Whether the type is unsigned pub unsigned_attribute: Option, + /// Whether the type has fixed precision and scale pub fixed_prec_scale: bool, + /// Whether the type is auto-incrementing pub auto_increment: Option, + /// The local type name of the type pub local_type_name: Option, + /// The minimum scale of the type pub minimum_scale: Option, + /// The maximum scale of the type pub maximum_scale: Option, + /// The SQL data type of the type pub sql_data_type: XdbcDataType, + /// The optional datetime subcode of the type pub datetime_subcode: Option, + /// The number precision radix of the type pub num_prec_radix: Option, + /// The interval precision of the type pub interval_precision: Option, } @@ -93,16 +112,6 @@ impl XdbcTypeInfoData { } } -pub struct XdbcTypeInfoDataBuilder { - infos: Vec, -} - -impl Default for XdbcTypeInfoDataBuilder { - fn default() -> Self { - Self::new() - } -} - /// A builder for [`XdbcTypeInfoData`] which is used to create [`CommandGetXdbcTypeInfo`] responses. /// /// # Example @@ -138,6 +147,16 @@ impl Default for XdbcTypeInfoDataBuilder { /// // to access the underlying record batch /// let batch = info_list.record_batch(None); /// ``` +pub struct XdbcTypeInfoDataBuilder { + infos: Vec, +} + +impl Default for XdbcTypeInfoDataBuilder { + fn default() -> Self { + Self::new() + } +} + impl XdbcTypeInfoDataBuilder { /// Create a new instance of [`XdbcTypeInfoDataBuilder`]. pub fn new() -> Self { diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 453f608d353a..94bb96a4f852 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -43,9 +43,11 @@ use bytes::Bytes; use paste::paste; use prost::Message; +#[allow(clippy::all)] mod gen { - #![allow(clippy::all)] #![allow(rustdoc::unportable_markdown)] + // Since this file is auto-generated, we suppress all warnings + #![allow(missing_docs)] include!("arrow.flight.protocol.sql.rs"); } @@ -163,7 +165,9 @@ macro_rules! prost_message_ext { /// ``` #[derive(Clone, Debug, PartialEq)] pub enum Command { - $($name($name),)* + $( + #[doc = concat!(stringify!($name), "variant")] + $name($name),)* /// Any message that is not any FlightSQL command. Unknown(Any), @@ -297,10 +301,12 @@ pub struct Any { } impl Any { + /// Checks whether the message is of type `M` pub fn is(&self) -> bool { M::type_url() == self.type_url } + /// Unpacks the contents of the message if it is of type `M` pub fn unpack(&self) -> Result, ArrowError> { if !self.is::() { return Ok(None); @@ -310,6 +316,7 @@ impl Any { Ok(Some(m)) } + /// Packs a message into an [`Any`] message pub fn pack(message: &M) -> Result { Ok(message.as_any()) } diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 37d7ff9e7293..f6129ddfe248 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -160,9 +160,12 @@ pub fn batches_to_flight_data( dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into)); flight_data.push(encoded_batch.into()); } - let mut stream = vec![schema_flight_data]; + + let mut stream = Vec::with_capacity(1 + dictionaries.len() + flight_data.len()); + + stream.push(schema_flight_data); stream.extend(dictionaries); stream.extend(flight_data); - let flight_data: Vec<_> = stream.into_iter().collect(); + let flight_data = stream; Ok(flight_data) } diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index d1486fd5a153..ea5b545f2e81 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -21,6 +21,7 @@ //! //! This is not a canonical format, but provides a human-readable way of verifying language implementations +#![warn(missing_docs)] use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, ScalarBuffer}; use hex::decode; use num::BigInt; @@ -49,8 +50,11 @@ pub use schema::*; /// See #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJson { + /// The Arrow schema for JSON file pub schema: ArrowJsonSchema, + /// The `RecordBatch`es in the JSON file pub batches: Vec, + /// The dictionaries in the JSON file #[serde(skip_serializing_if = "Option::is_none")] pub dictionaries: Option>, } @@ -60,7 +64,9 @@ pub struct ArrowJson { /// Fields are left as JSON `Value` as they vary by `DataType` #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJsonSchema { + /// An array of JSON fields pub fields: Vec, + /// An array of metadata key-value pairs #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option>>, } @@ -68,13 +74,20 @@ pub struct ArrowJsonSchema { /// Fields are left as JSON `Value` as they vary by `DataType` #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJsonField { + /// The name of the field pub name: String, + /// The data type of the field, + /// can be any valid JSON value #[serde(rename = "type")] pub field_type: Value, + /// Whether the field is nullable pub nullable: bool, + /// The children fields pub children: Vec, + /// The dictionary for the field #[serde(skip_serializing_if = "Option::is_none")] pub dictionary: Option, + /// The metadata for the field, if any #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option, } @@ -115,20 +128,28 @@ impl From<&Field> for ArrowJsonField { } } +/// Represents a dictionary-encoded field in the Arrow JSON format #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJsonFieldDictionary { + /// A unique identifier for the dictionary pub id: i64, + /// The type of the dictionary index #[serde(rename = "indexType")] pub index_type: DictionaryIndexType, + /// Whether the dictionary is ordered #[serde(rename = "isOrdered")] pub is_ordered: bool, } +/// Type of an index for a dictionary-encoded field in the Arrow JSON format #[derive(Deserialize, Serialize, Debug)] pub struct DictionaryIndexType { + /// The name of the dictionary index type pub name: String, + /// Whether the dictionary index type is signed #[serde(rename = "isSigned")] pub is_signed: bool, + /// The bit width of the dictionary index type #[serde(rename = "bitWidth")] pub bit_width: i64, } @@ -137,6 +158,7 @@ pub struct DictionaryIndexType { #[derive(Deserialize, Serialize, Debug, Clone)] pub struct ArrowJsonBatch { count: usize, + /// The columns in the record batch pub columns: Vec, } @@ -144,7 +166,9 @@ pub struct ArrowJsonBatch { #[derive(Deserialize, Serialize, Debug, Clone)] #[allow(non_snake_case)] pub struct ArrowJsonDictionaryBatch { + /// The unique identifier for the dictionary pub id: i64, + /// The data for the dictionary pub data: ArrowJsonBatch, } @@ -152,15 +176,21 @@ pub struct ArrowJsonDictionaryBatch { #[derive(Deserialize, Serialize, Clone, Debug)] pub struct ArrowJsonColumn { name: String, + /// The number of elements in the column pub count: usize, + /// The validity bitmap to determine null values #[serde(rename = "VALIDITY")] pub validity: Option>, + /// The data values in the column #[serde(rename = "DATA")] pub data: Option>, + /// The offsets for variable-sized data types #[serde(rename = "OFFSET")] pub offset: Option>, // leaving as Value as 64-bit offsets are strings + /// The type id for union types #[serde(rename = "TYPE_ID")] pub type_id: Option>, + /// The children columns for nested types pub children: Option>, } @@ -189,6 +219,7 @@ impl ArrowJson { Ok(true) } + /// Convert the stored dictionaries to `Vec[RecordBatch]` pub fn get_record_batches(&self) -> Result> { let schema = self.schema.to_arrow_schema()?; @@ -275,6 +306,7 @@ impl ArrowJsonField { } } +/// Generates a [`RecordBatch`] from an Arrow JSON batch, given a schema pub fn record_batch_from_json( schema: &Schema, json_batch: ArrowJsonBatch, @@ -877,6 +909,7 @@ pub fn array_from_json( } } +/// Construct a [`DictionaryArray`] from a partially typed JSON column pub fn dictionary_array_from_json( field: &Field, json_col: ArrowJsonColumn, @@ -965,6 +998,7 @@ fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { } impl ArrowJsonBatch { + /// Convert a [`RecordBatch`] to an [`ArrowJsonBatch`] pub fn from_batch(batch: &RecordBatch) -> ArrowJsonBatch { let mut json_batch = ArrowJsonBatch { count: batch.num_rows(), diff --git a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs index 376e31e15553..34c3c7706df5 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Scenario for testing basic auth. + use crate::{AUTH_PASSWORD, AUTH_USERNAME}; use arrow_flight::{flight_service_client::FlightServiceClient, BasicAuth, HandshakeRequest}; @@ -27,6 +29,7 @@ type Result = std::result::Result; type Client = FlightServiceClient; +/// Run a scenario that tests basic auth. pub async fn run_scenario(host: &str, port: u16) -> Result { let url = format!("http://{host}:{port}"); let mut client = FlightServiceClient::connect(url).await?; diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 1a6c4e28a76b..c8289ff446a0 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Integration tests for the Flight client. + use crate::open_json_file; use std::collections::HashMap; @@ -40,6 +42,7 @@ type Result = std::result::Result; type Client = FlightServiceClient; +/// Run a scenario that uploads data to a Flight server and then downloads it back pub async fn run_scenario(host: &str, port: u16, path: &str) -> Result { let url = format!("http://{host}:{port}"); diff --git a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs index 3b71edf446a3..b826ad456055 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Scenario for testing middleware. + use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, FlightDescriptor, }; @@ -24,6 +26,7 @@ use tonic::{Request, Status}; type Error = Box; type Result = std::result::Result; +/// Run a scenario that tests middleware. pub async fn run_scenario(host: &str, port: u16) -> Result { let url = format!("http://{host}:{port}"); let conn = tonic::transport::Endpoint::new(url)?.connect().await?; diff --git a/arrow-integration-testing/src/flight_client_scenarios.rs b/arrow-integration-testing/src/flight_client_scenarios/mod.rs similarity index 93% rename from arrow-integration-testing/src/flight_client_scenarios.rs rename to arrow-integration-testing/src/flight_client_scenarios/mod.rs index 66cced5f4c2e..c5794433764a 100644 --- a/arrow-integration-testing/src/flight_client_scenarios.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Collection of utilities for testing the Flight client. + pub mod auth_basic_proto; pub mod integration_test; pub mod middleware; diff --git a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs index 20d868953664..5462e5bd674b 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Basic auth test for the Flight server. + use std::pin::Pin; use std::sync::Arc; @@ -35,6 +37,7 @@ use prost::Message; use crate::{AUTH_PASSWORD, AUTH_USERNAME}; +/// Run a scenario that tests basic auth. pub async fn scenario_setup(port: u16) -> Result { let service = AuthBasicProtoScenarioImpl { username: AUTH_USERNAME.into(), @@ -52,6 +55,7 @@ pub async fn scenario_setup(port: u16) -> Result { Ok(()) } +/// Scenario for testing basic auth. #[derive(Clone)] pub struct AuthBasicProtoScenarioImpl { username: Arc, diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 76eb9d880199..0c58fae93df5 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +//! Integration tests for the Flight server. + +use core::str; use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; @@ -42,6 +45,7 @@ type TonicStream = Pin + Send + Sync + 'static>>; type Error = Box; type Result = std::result::Result; +/// Run a scenario that tests integration testing. pub async fn scenario_setup(port: u16) -> Result { let addr = super::listen_on(port).await?; @@ -65,6 +69,7 @@ struct IntegrationDataset { chunks: Vec, } +/// Flight service implementation for integration testing #[derive(Clone, Default)] pub struct FlightServiceImpl { server_location: String, @@ -100,13 +105,13 @@ impl FlightService for FlightServiceImpl { ) -> Result, Status> { let ticket = request.into_inner(); - let key = String::from_utf8(ticket.ticket.to_vec()) + let key = str::from_utf8(&ticket.ticket) .map_err(|e| Status::invalid_argument(format!("Invalid ticket: {e:?}")))?; let uploaded_chunks = self.uploaded_chunks.lock().await; let flight = uploaded_chunks - .get(&key) + .get(key) .ok_or_else(|| Status::not_found(format!("Could not find flight. {key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); diff --git a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs index e8d9c521bb99..6685d45dffac 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Middleware test for the Flight server. + use std::pin::Pin; use arrow_flight::{ @@ -31,6 +33,7 @@ type TonicStream = Pin + Send + Sync + 'static>>; type Error = Box; type Result = std::result::Result; +/// Run a scenario that tests middleware. pub async fn scenario_setup(port: u16) -> Result { let service = MiddlewareScenarioImpl {}; let svc = FlightServiceServer::new(service); @@ -44,6 +47,7 @@ pub async fn scenario_setup(port: u16) -> Result { Ok(()) } +/// Middleware interceptor for testing #[derive(Clone, Default)] pub struct MiddlewareScenarioImpl {} diff --git a/arrow-integration-testing/src/flight_server_scenarios.rs b/arrow-integration-testing/src/flight_server_scenarios/mod.rs similarity index 91% rename from arrow-integration-testing/src/flight_server_scenarios.rs rename to arrow-integration-testing/src/flight_server_scenarios/mod.rs index 48d4e6045684..3833e1c6335c 100644 --- a/arrow-integration-testing/src/flight_server_scenarios.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +//! Collection of utilities for testing the Flight server. use std::net::SocketAddr; use arrow_flight::{FlightEndpoint, Location, Ticket}; @@ -27,6 +28,7 @@ pub mod middleware; type Error = Box; type Result = std::result::Result; +/// Listen on a port and return the address pub async fn listen_on(port: u16) -> Result { let addr: SocketAddr = format!("0.0.0.0:{port}").parse()?; @@ -36,6 +38,7 @@ pub async fn listen_on(port: u16) -> Result { Ok(addr) } +/// Create a FlightEndpoint with a ticket and location pub fn endpoint(ticket: &str, location_uri: impl Into) -> FlightEndpoint { FlightEndpoint { ticket: Some(Ticket { diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index 4ce7b06a1888..ba8e3876c3e3 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,7 @@ //! Common code used in the integration test binaries +#![warn(missing_docs)] use serde_json::Value; use arrow::array::{Array, StructArray}; @@ -42,7 +43,9 @@ pub const AUTH_PASSWORD: &str = "flight"; pub mod flight_client_scenarios; pub mod flight_server_scenarios; +/// An Arrow file in JSON format pub struct ArrowFile { + /// The schema of the file pub schema: Schema, // we can evolve this into a concrete Arrow type // this is temporarily not being read from @@ -51,12 +54,14 @@ pub struct ArrowFile { } impl ArrowFile { + /// Read a single [RecordBatch] from the file pub fn read_batch(&self, batch_num: usize) -> Result { let b = self.arrow_json["batches"].get(batch_num).unwrap(); let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); record_batch_from_json(&self.schema, json_batch, Some(&self.dictionaries)) } + /// Read all [RecordBatch]es from the file pub fn read_batches(&self) -> Result> { self.arrow_json["batches"] .as_array() @@ -70,7 +75,7 @@ impl ArrowFile { } } -// Canonicalize the names of map fields in a schema +/// Canonicalize the names of map fields in a schema pub fn canonicalize_schema(schema: &Schema) -> Schema { let fields = schema .fields() @@ -107,6 +112,7 @@ pub fn canonicalize_schema(schema: &Schema) -> Schema { Schema::new(fields).with_metadata(schema.metadata().clone()) } +/// Read an Arrow file in JSON format pub fn open_json_file(json_name: &str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); @@ -157,10 +163,7 @@ pub fn read_gzip_json(version: &str, path: &str) -> ArrowJson { arrow_json } -// -// C Data Integration entrypoints -// - +/// C Data Integration entrypoint to export the schema from a JSON file fn cdata_integration_export_schema_from_json( c_json_name: *const i8, out: *mut FFI_ArrowSchema, @@ -173,6 +176,7 @@ fn cdata_integration_export_schema_from_json( Ok(()) } +/// C Data Integration entrypoint to export a batch from a JSON file fn cdata_integration_export_batch_from_json( c_json_name: *const i8, batch_num: c_int, @@ -263,6 +267,7 @@ pub unsafe extern "C" fn arrow_rs_free_error(c_error: *mut i8) { } } +/// A C-ABI for exporting an Arrow schema from a JSON file #[no_mangle] pub extern "C" fn arrow_rs_cdata_integration_export_schema_from_json( c_json_name: *const i8, @@ -272,6 +277,7 @@ pub extern "C" fn arrow_rs_cdata_integration_export_schema_from_json( result_to_c_error(&r) } +/// A C-ABI to compare an Arrow schema against a JSON file #[no_mangle] pub extern "C" fn arrow_rs_cdata_integration_import_schema_and_compare_to_json( c_json_name: *const i8, @@ -281,6 +287,7 @@ pub extern "C" fn arrow_rs_cdata_integration_import_schema_and_compare_to_json( result_to_c_error(&r) } +/// A C-ABI for exporting a RecordBatch from a JSON file #[no_mangle] pub extern "C" fn arrow_rs_cdata_integration_export_batch_from_json( c_json_name: *const i8, @@ -291,6 +298,7 @@ pub extern "C" fn arrow_rs_cdata_integration_export_batch_from_json( result_to_c_error(&r) } +/// A C-ABI to compare a RecordBatch against a JSON file #[no_mangle] pub extern "C" fn arrow_rs_cdata_integration_import_batch_and_compare_to_json( c_json_name: *const i8, diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 52c6a0d614d0..eef236529e10 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -133,6 +133,7 @@ pub fn schema_to_fb(schema: &Schema) -> FlatBufferBuilder<'_> { IpcSchemaEncoder::new().schema_to_fb(schema) } +/// Push a key-value metadata into a FlatBufferBuilder and return [WIPOffset] pub fn metadata_to_fb<'a>( fbb: &mut FlatBufferBuilder<'a>, metadata: &HashMap, @@ -152,7 +153,7 @@ pub fn metadata_to_fb<'a>( fbb.create_vector(&custom_metadata) } -#[deprecated(since = "54.0.0", note = "Use `IpcSchemaConverter`.")] +/// Adds a [Schema] to a flatbuffer and returns the offset pub fn schema_to_fb_offset<'a>( fbb: &mut FlatBufferBuilder<'a>, schema: &Schema, diff --git a/arrow-ipc/src/lib.rs b/arrow-ipc/src/lib.rs index 4f35ffb60a9f..dde137153964 100644 --- a/arrow-ipc/src/lib.rs +++ b/arrow-ipc/src/lib.rs @@ -19,6 +19,7 @@ //! //! [Arrow IPC Format]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc +#![warn(missing_docs)] pub mod convert; pub mod reader; pub mod writer; @@ -31,6 +32,7 @@ mod compression; #[allow(clippy::redundant_static_lifetimes)] #[allow(clippy::redundant_field_names)] #[allow(non_camel_case_types)] +#[allow(missing_docs)] // Because this is autogenerated pub mod gen; pub use self::gen::File::*; diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b5cf20ef337f..f9256b4e8175 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -60,7 +60,7 @@ pub struct IpcWriteOptions { /// Compression, if desired. Will result in a runtime error /// if the corresponding feature is not enabled batch_compression_type: Option, - /// Flag indicating whether the writer should preserver the dictionary IDs defined in the + /// Flag indicating whether the writer should preserve the dictionary IDs defined in the /// schema or generate unique dictionary IDs internally during encoding. /// /// Defaults to `true` @@ -135,6 +135,8 @@ impl IpcWriteOptions { } } + /// Return whether the writer is configured to preserve the dictionary IDs + /// defined in the schema pub fn preserve_dict_id(&self) -> bool { self.preserve_dict_id } @@ -200,6 +202,11 @@ impl Default for IpcWriteOptions { pub struct IpcDataGenerator {} impl IpcDataGenerator { + /// Converts a schema to an IPC message along with `dictionary_tracker` + /// and returns it encoded inside [EncodedData] as a flatbuffer + /// + /// Preferred method over [IpcDataGenerator::schema_to_bytes] since it's + /// deprecated since Arrow v54.0.0 pub fn schema_to_bytes_with_dictionary_tracker( &self, schema: &Schema, @@ -234,6 +241,7 @@ impl IpcDataGenerator { since = "54.0.0", note = "Use `schema_to_bytes_with_dictionary_tracker` instead. This function signature of `schema_to_bytes_with_dictionary_tracker` in the next release." )] + /// Converts a schema to an IPC message and returns it encoded inside [EncodedData] as a flatbuffer pub fn schema_to_bytes(&self, schema: &Schema, write_options: &IpcWriteOptions) -> EncodedData { let mut fbb = FlatBufferBuilder::new(); let schema = { @@ -951,6 +959,7 @@ impl FileWriter { }) } + /// Adds a key-value pair to the [FileWriter]'s custom metadata pub fn write_metadata(&mut self, key: impl Into, value: impl Into) { self.custom_metadata.insert(key.into(), value.into()); } diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 86d2e88d99f0..d973206ccf74 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -397,6 +397,7 @@ where #[cfg(test)] mod tests { + use core::str; use std::fs::{read_to_string, File}; use std::io::{BufReader, Seek}; use std::sync::Arc; @@ -1111,7 +1112,7 @@ mod tests { } } - let result = String::from_utf8(buf).unwrap(); + let result = str::from_utf8(&buf).unwrap(); let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); @@ -1150,7 +1151,7 @@ mod tests { fn json_writer_empty() { let mut writer = ArrayWriter::new(vec![] as Vec); writer.finish().unwrap(); - assert_eq!(String::from_utf8(writer.into_inner()).unwrap(), ""); + assert_eq!(str::from_utf8(&writer.into_inner()).unwrap(), ""); } #[test] @@ -1279,7 +1280,7 @@ mod tests { writer.write(&batch).unwrap(); } - let result = String::from_utf8(buf).unwrap(); + let result = str::from_utf8(&buf).unwrap(); let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); @@ -1321,7 +1322,7 @@ mod tests { writer.write_batches(&batches).unwrap(); } - let result = String::from_utf8(buf).unwrap(); + let result = str::from_utf8(&buf).unwrap(); let expected = read_to_string(test_file).unwrap(); // result is eq to 2 same batches let expected = format!("{expected}\n{expected}"); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index fc4852a3d37d..b532ea8616b6 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -610,14 +610,14 @@ mod test { #[test] fn test_new_with_string() { // Fields should allow owned Strings to support reuse - let s = String::from("c1"); + let s = "c1"; Field::new(s, DataType::Int64, false); } #[test] fn test_new_dict_with_string() { // Fields should allow owned Strings to support reuse - let s = String::from("c1"); + let s = "c1"; Field::new_dict(s, DataType::Int64, false, 4, false); } diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 0fd89cc2bff4..8f86cbeab717 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -179,7 +179,7 @@ fn test_can_cast_types() { /// Create instances of arrays with varying types for cast tests fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("+08:00"); + let tz_name = "+08:00"; let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; vec![ Arc::new(BinaryArray::from(binary_data.clone())), @@ -238,9 +238,9 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), - Arc::new(TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), - Arc::new(TimestampMillisecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), - Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), + Arc::new(TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), Arc::new(TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), Arc::new(Date32Array::from(vec![1000, 2000])), Arc::new(Date64Array::from(vec![1000, 2000])), diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 75acb73e56a9..c52c3f8dfbd7 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -44,7 +44,6 @@ static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName, diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 6fe4889db176..7034a372e95f 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -65,7 +65,6 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("Error performing DeleteObjects request: {}", source))] DeleteObjectsRequest { source: crate::client::retry::Error }, diff --git a/object_store/src/aws/resolve.rs b/object_store/src/aws/resolve.rs index 12c9f26d220b..4c7489316b6c 100644 --- a/object_store/src/aws/resolve.rs +++ b/object_store/src/aws/resolve.rs @@ -21,7 +21,6 @@ use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Bucket '{}' not found", bucket))] BucketNotFound { bucket: String }, diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 35cedeafc049..1c4589ba1ec6 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -46,7 +46,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; /// A specialized `Error` for Azure builder-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] UnableToParseUrl { diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 04990515543a..06d3fb5c8678 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -60,7 +60,6 @@ static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 0fef5785c052..ae6a8d9deaae 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -96,7 +96,6 @@ impl ContentRange { /// A specialized `Error` for get-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum GetResultError { #[snafu(context(false))] Header { diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 8820983b2025..a0d83eb0b6dd 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1224,78 +1224,116 @@ pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] #[non_exhaustive] pub enum Error { + /// A fallback error type when no variant matches #[snafu(display("Generic {} error: {}", store, source))] Generic { + /// The store this error originated from store: &'static str, + /// The wrapped error source: Box, }, + /// Error when the object is not found at given location #[snafu(display("Object at location {} not found: {}", path, source))] NotFound { + /// The path to file path: String, + /// The wrapped error source: Box, }, + /// Error for invalid path #[snafu( display("Encountered object with invalid path: {}", source), context(false) )] - InvalidPath { source: path::Error }, + InvalidPath { + /// The wrapped error + source: path::Error, + }, + /// Error when `tokio::spawn` failed #[snafu(display("Error joining spawned task: {}", source), context(false))] - JoinError { source: tokio::task::JoinError }, + JoinError { + /// The wrapped error + source: tokio::task::JoinError, + }, + /// Error when the attempted operation is not supported #[snafu(display("Operation not supported: {}", source))] NotSupported { + /// The wrapped error source: Box, }, + /// Error when the object already exists #[snafu(display("Object at location {} already exists: {}", path, source))] AlreadyExists { + /// The path to the path: String, + /// The wrapped error source: Box, }, + /// Error when the required conditions failed for the operation #[snafu(display("Request precondition failure for path {}: {}", path, source))] Precondition { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when the object at the location isn't modified #[snafu(display("Object at location {} not modified: {}", path, source))] NotModified { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when an operation is not implemented #[snafu(display("Operation not yet implemented."))] NotImplemented, + /// Error when the used credentials don't have enough permission + /// to perform the requested operation #[snafu(display( "The operation lacked the necessary privileges to complete for path {}: {}", path, source ))] PermissionDenied { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when the used credentials lack valid authentication #[snafu(display( "The operation lacked valid authentication credentials for path {}: {}", path, source ))] Unauthenticated { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when a configuration key is invalid for the store used #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] - UnknownConfigurationKey { store: &'static str, key: String }, + UnknownConfigurationKey { + /// The object store used + store: &'static str, + /// The configuration key used + key: String, + }, } impl From for std::io::Error { diff --git a/object_store/src/local.rs b/object_store/src/local.rs index db4b4b05031e..ac10f332d743 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -44,7 +44,6 @@ use crate::{ /// A specialized `Error` for filesystem object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] FileSizeOverflowedUsize { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 0d72983b0495..b458bdddfbf5 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -38,7 +38,6 @@ use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("No data in memory found. Location: {path}"))] NoDataInMemory { path: String }, diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 59e08e2eaba9..4c9bb5f05186 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -36,32 +36,57 @@ pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] #[derive(Debug, Snafu)] -#[allow(missing_docs)] #[non_exhaustive] pub enum Error { + /// Error when there's an empty segment between two slashes `/` in the path #[snafu(display("Path \"{}\" contained empty path segment", path))] - EmptySegment { path: String }, + EmptySegment { + /// The source path + path: String, + }, + /// Error when an invalid segment is encountered in the given path #[snafu(display("Error parsing Path \"{}\": {}", path, source))] - BadSegment { path: String, source: InvalidPart }, + BadSegment { + /// The source path + path: String, + /// The part containing the error + source: InvalidPart, + }, + /// Error when path cannot be canonicalized #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] Canonicalize { + /// The source path path: std::path::PathBuf, + /// The underlying error source: std::io::Error, }, + /// Error when the path is not a valid URL #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] - InvalidPath { path: std::path::PathBuf }, + InvalidPath { + /// The source path + path: std::path::PathBuf, + }, + /// Error when a path contains non-unicode characters #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] NonUnicode { + /// The source path path: String, + /// The underlying `UTF8Error` source: std::str::Utf8Error, }, + /// Error when the a path doesn't start with given prefix #[snafu(display("Path {} does not start with prefix {}", path, prefix))] - PrefixMismatch { path: String, prefix: String }, + PrefixMismatch { + /// The source path + path: String, + /// The mismatched prefix + prefix: String, + }, } /// A parsed path representation that can be safely written to object storage diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index edf675f1302a..6953f2d6c09c 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -298,6 +298,30 @@ mod gzip_codec { pub use gzip_codec::*; /// Represents a valid gzip compression level. +/// +/// #### WARNING: +/// Level 10 compression can offer smallest file size, +/// but Parquet files created with it will not be readable +/// by other "standard" paquet readers. +/// +/// Do **NOT** use level 10 if you need other software to +/// be able to read the files. Read below for details. +/// +/// ### IMPORTANT: +/// There's often confusion about the compression levels in `flate2` vs `arrow` +/// as highlighted in issue [#1011](https://github.com/apache/arrow-rs/issues/6282). +/// +/// `flate2` supports two compression backends: `miniz_oxide` and `zlib`. +/// +/// - `zlib` supports levels from 0 to 9. +/// - `miniz_oxide` supports levels from 0 to 10. +/// +/// `arrow` uses `flate` with `rust_backend` feature, +/// which provides `miniz_oxide` as the backend. +/// Therefore 0-10 levels are supported. +/// +/// `flate2` documents this behavior properly with +/// [this commit](https://github.com/rust-lang/flate2-rs/pull/430). #[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] pub struct GzipLevel(u32); From 880c59b4951d83a1b2395e3a9f24d785fb69a7ca Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:51:26 +0530 Subject: [PATCH 2/2] chore: resolve PR comments from #6453 --- arrow-flight/src/bin/flight_sql_client.rs | 2 +- parquet/src/compression.rs | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index 1fbf81f3b524..8f0618f495bc 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -422,7 +422,7 @@ fn log_metadata(map: &MetadataMap, what: &'static str) { "{}: {}={}", what, k.as_str(), - str::from_utf8(v.as_ref()).unwrap(), + String::from_utf8_lossy(v.as_ref()), ); } } diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 6953f2d6c09c..ccc060250af4 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -299,6 +299,11 @@ pub use gzip_codec::*; /// Represents a valid gzip compression level. /// +/// Defaults to 6. +/// +/// * 0: least compression +/// * 9: most compression (that other software can read) +/// * 10: most compression (incompatible with other software, see below) /// #### WARNING: /// Level 10 compression can offer smallest file size, /// but Parquet files created with it will not be readable