diff --git a/crates/polars-io/src/csv/write/options.rs b/crates/polars-io/src/csv/write/options.rs index e1c71f3873b4..b0602cbc2a92 100644 --- a/crates/polars-io/src/csv/write/options.rs +++ b/crates/polars-io/src/csv/write/options.rs @@ -40,6 +40,7 @@ pub struct SerializeOptions { pub datetime_format: Option, /// Used for [`DataType::Float64`](polars_core::datatypes::DataType::Float64) /// and [`DataType::Float32`](polars_core::datatypes::DataType::Float32). + pub float_scientific: Option, pub float_precision: Option, /// Used as separator. pub separator: u8, @@ -59,6 +60,7 @@ impl Default for SerializeOptions { date_format: None, time_format: None, datetime_format: None, + float_scientific: None, float_precision: None, separator: b',', quote_char: b'"', diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index 89983dfd8fce..e6b29b80d2b0 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -30,6 +30,7 @@ //! but also with `QUOTE_NON_NULL = false`. //! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`. +use std::fmt::LowerExp; use std::io::Write; use arrow::array::{Array, BooleanArray, NullArray, PrimitiveArray, Utf8ViewArray}; @@ -38,6 +39,7 @@ use arrow::types::NativeType; #[cfg(feature = "timezones")] use chrono::TimeZone; use memchr::{memchr3, memchr_iter}; +use num_traits::NumCast; use polars_core::prelude::*; use crate::csv::write::{QuoteStyle, SerializeOptions}; @@ -121,7 +123,7 @@ fn integer_serializer(array: &PrimitiveArray) }) } -fn float_serializer_no_precision( +fn float_serializer_no_precision_autoformat( array: &PrimitiveArray, ) -> impl Serializer { let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { @@ -139,7 +141,60 @@ fn float_serializer_no_precision( }) } -fn float_serializer_with_precision( +fn float_serializer_no_precision_scientific( + array: &PrimitiveArray, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + // Float writing into a buffer of `Vec` cannot fail. + let _ = write!(buf, "{item:.e}"); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_no_precision_positional( + array: &PrimitiveArray, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + let v: f64 = NumCast::from(item).unwrap(); + let value = v.to_string(); + buf.extend_from_slice(value.as_bytes()); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_with_precision_scientific( + array: &PrimitiveArray, + precision: usize, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + // Float writing into a buffer of `Vec` cannot fail. + let _ = write!(buf, "{item:.precision$e}"); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_with_precision_positional( array: &PrimitiveArray, precision: usize, ) -> impl Serializer { @@ -463,12 +518,30 @@ pub(super) fn serializer_for<'a>( DataType::Int64 => quote_if_always!(integer_serializer::), DataType::UInt64 => quote_if_always!(integer_serializer::), DataType::Float32 => match options.float_precision { - Some(precision) => quote_if_always!(float_serializer_with_precision::, precision), - None => quote_if_always!(float_serializer_no_precision::), + Some(precision) => match options.float_scientific { + Some(true) => { + quote_if_always!(float_serializer_with_precision_scientific::, precision) + }, + _ => quote_if_always!(float_serializer_with_precision_positional::, precision), + }, + None => match options.float_scientific { + Some(true) => quote_if_always!(float_serializer_no_precision_scientific::), + Some(false) => quote_if_always!(float_serializer_no_precision_positional::), + None => quote_if_always!(float_serializer_no_precision_autoformat::), + }, }, DataType::Float64 => match options.float_precision { - Some(precision) => quote_if_always!(float_serializer_with_precision::, precision), - None => quote_if_always!(float_serializer_no_precision::), + Some(precision) => match options.float_scientific { + Some(true) => { + quote_if_always!(float_serializer_with_precision_scientific::, precision) + }, + _ => quote_if_always!(float_serializer_with_precision_positional::, precision), + }, + None => match options.float_scientific { + Some(true) => quote_if_always!(float_serializer_no_precision_scientific::), + Some(false) => quote_if_always!(float_serializer_no_precision_positional::), + None => quote_if_always!(float_serializer_no_precision_autoformat::), + }, }, DataType::Null => quote_if_always!(null_serializer), DataType::Boolean => { diff --git a/crates/polars-io/src/csv/write/writer.rs b/crates/polars-io/src/csv/write/writer.rs index 0bb6bd2ce534..9369dacbe6da 100644 --- a/crates/polars-io/src/csv/write/writer.rs +++ b/crates/polars-io/src/csv/write/writer.rs @@ -115,6 +115,14 @@ where self } + /// Set the CSV file's forced scientific notation for floats. + pub fn with_float_scientific(mut self, scientific: Option) -> Self { + if scientific.is_some() { + self.options.float_scientific = scientific; + } + self + } + /// Set the CSV file's float precision. pub fn with_float_precision(mut self, precision: Option) -> Self { if precision.is_some() { diff --git a/crates/polars-pipe/src/executors/sinks/output/csv.rs b/crates/polars-pipe/src/executors/sinks/output/csv.rs index 5c7481c2ad6f..773287b834b1 100644 --- a/crates/polars-pipe/src/executors/sinks/output/csv.rs +++ b/crates/polars-pipe/src/executors/sinks/output/csv.rs @@ -23,6 +23,7 @@ impl CsvSink { .with_datetime_format(options.serialize_options.datetime_format) .with_date_format(options.serialize_options.date_format) .with_time_format(options.serialize_options.time_format) + .with_float_scientific(options.serialize_options.float_scientific) .with_float_precision(options.serialize_options.float_precision) .with_null_value(options.serialize_options.null) .with_quote_style(options.serialize_options.quote_style) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ec0ce54249ba..83d05afafc48 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2512,6 +2512,7 @@ def write_csv( datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., + float_scientific: bool | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., @@ -2531,6 +2532,7 @@ def write_csv( datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., + float_scientific: bool | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., @@ -2549,6 +2551,7 @@ def write_csv( datetime_format: str | None = None, date_format: str | None = None, time_format: str | None = None, + float_scientific: bool | None = None, float_precision: int | None = None, null_value: str | None = None, quote_style: CsvQuoteStyle | None = None, @@ -2587,6 +2590,9 @@ def write_csv( A format string, with the specifiers defined by the `chrono `_ Rust crate. + float_scientific + Whether to use scientific form always (true), never (false), or + automatically (None) for `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. @@ -2650,6 +2656,7 @@ def write_csv( datetime_format, date_format, time_format, + float_scientific, float_precision, null_value, quote_style, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 698884360910..0c56c702c398 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2312,6 +2312,7 @@ def sink_csv( datetime_format: str | None = None, date_format: str | None = None, time_format: str | None = None, + float_scientific: bool | None = None, float_precision: int | None = None, null_value: str | None = None, quote_style: CsvQuoteStyle | None = None, @@ -2362,6 +2363,9 @@ def sink_csv( A format string, with the specifiers defined by the `chrono `_ Rust crate. + float_scientific + Whether to use scientific form always (true), never (false), or + automatically (None) for `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. @@ -2436,6 +2440,7 @@ def sink_csv( datetime_format=datetime_format, date_format=date_format, time_format=time_format, + float_scientific=float_scientific, float_precision=float_precision, null_value=null_value, quote_style=quote_style, diff --git a/py-polars/src/dataframe/io.rs b/py-polars/src/dataframe/io.rs index 3c46eb1dbd83..7542c5905193 100644 --- a/py-polars/src/dataframe/io.rs +++ b/py-polars/src/dataframe/io.rs @@ -370,6 +370,7 @@ impl PyDataFrame { datetime_format: Option, date_format: Option, time_format: Option, + float_scientific: Option, float_precision: Option, null_value: Option, quote_style: Option>, @@ -390,6 +391,7 @@ impl PyDataFrame { .with_datetime_format(datetime_format) .with_date_format(date_format) .with_time_format(time_format) + .with_float_scientific(float_scientific) .with_float_precision(float_precision) .with_null_value(null) .with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default()) @@ -408,6 +410,7 @@ impl PyDataFrame { .with_datetime_format(datetime_format) .with_date_format(date_format) .with_time_format(time_format) + .with_float_scientific(float_scientific) .with_float_precision(float_precision) .with_null_value(null) .with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default()) diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index 9de61cd2b022..d57c3af98394 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -721,7 +721,7 @@ impl PyLazyFrame { } #[cfg(all(feature = "streaming", feature = "csv"))] - #[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style, maintain_order))] + #[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_scientific, float_precision, null_value, quote_style, maintain_order))] fn sink_csv( &self, py: Python, @@ -735,6 +735,7 @@ impl PyLazyFrame { datetime_format: Option, date_format: Option, time_format: Option, + float_scientific: Option, float_precision: Option, null_value: Option, quote_style: Option>, @@ -747,6 +748,7 @@ impl PyLazyFrame { date_format, time_format, datetime_format, + float_scientific, float_precision, separator, quote_char, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index e66c1e53ef3b..a1a059ee62f6 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1347,6 +1347,52 @@ def test_float_precision(dtype: pl.Float32 | pl.Float64) -> None: assert df.write_csv(float_precision=3) == "col\n1.000\n2.200\n3.330\n" +def test_float_scientific() -> None: + df = ( + pl.Series( + "colf64", + [3.141592653589793 * mult for mult in (1e-8, 1e-3, 1e3, 1e17)], + dtype=pl.Float64, + ) + .to_frame() + .with_columns(pl.col("colf64").cast(pl.Float32).alias("colf32")) + ) + + assert ( + df.write_csv(float_precision=None, float_scientific=False) + == "colf64,colf32\n0.00000003141592653589793,0.00000003141592586075603\n0.0031415926535897933,0.0031415927223861217\n3141.592653589793,3141.5927734375\n314159265358979300,314159265516355600\n" + ) + assert ( + df.write_csv(float_precision=0, float_scientific=False) + == "colf64,colf32\n0,0\n0,0\n3142,3142\n314159265358979328,314159265516355584\n" + ) + assert ( + df.write_csv(float_precision=1, float_scientific=False) + == "colf64,colf32\n0.0,0.0\n0.0,0.0\n3141.6,3141.6\n314159265358979328.0,314159265516355584.0\n" + ) + assert ( + df.write_csv(float_precision=3, float_scientific=False) + == "colf64,colf32\n0.000,0.000\n0.003,0.003\n3141.593,3141.593\n314159265358979328.000,314159265516355584.000\n" + ) + + assert ( + df.write_csv(float_precision=None, float_scientific=True) + == "colf64,colf32\n3.141592653589793e-8,3.1415926e-8\n3.1415926535897933e-3,3.1415927e-3\n3.141592653589793e3,3.1415928e3\n3.141592653589793e17,3.1415927e17\n" + ) + assert ( + df.write_csv(float_precision=0, float_scientific=True) + == "colf64,colf32\n3e-8,3e-8\n3e-3,3e-3\n3e3,3e3\n3e17,3e17\n" + ) + assert ( + df.write_csv(float_precision=1, float_scientific=True) + == "colf64,colf32\n3.1e-8,3.1e-8\n3.1e-3,3.1e-3\n3.1e3,3.1e3\n3.1e17,3.1e17\n" + ) + assert ( + df.write_csv(float_precision=3, float_scientific=True) + == "colf64,colf32\n3.142e-8,3.142e-8\n3.142e-3,3.142e-3\n3.142e3,3.142e3\n3.142e17,3.142e17\n" + ) + + def test_skip_rows_different_field_len() -> None: csv = io.StringIO( textwrap.dedent( diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index e2533128cab9..1f60cd3c6f3e 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -148,6 +148,7 @@ def test_sink_csv_with_options() -> None: datetime_format="%Y", date_format="%d", time_format="%H", + float_scientific=True, float_precision=42, null_value="BOOM", quote_style="always", @@ -165,6 +166,7 @@ def test_sink_csv_with_options() -> None: datetime_format="%Y", date_format="%d", time_format="%H", + float_scientific=True, float_precision=42, null_value="BOOM", quote_style="always",