From 2881ae0c3a61491e8e5feb8d13ddb2769eed22a7 Mon Sep 17 00:00:00 2001 From: Luke Shingles Date: Fri, 21 Jun 2024 12:05:28 +0100 Subject: [PATCH 1/3] Add float_scientific option to write_csv --- crates/polars-io/src/csv/write/options.rs | 2 + .../src/csv/write/write_impl/serializer.rs | 30 +++++++++--- crates/polars-io/src/csv/write/writer.rs | 8 ++++ .../src/executors/sinks/output/csv.rs | 1 + py-polars/polars/dataframe/frame.py | 7 +++ py-polars/polars/lazyframe/frame.py | 5 ++ py-polars/src/dataframe/io.rs | 3 ++ py-polars/src/lazyframe/mod.rs | 4 +- py-polars/tests/unit/io/test_csv.py | 46 +++++++++++++++++++ .../tests/unit/streaming/test_streaming_io.py | 2 + 10 files changed, 100 insertions(+), 8 deletions(-) diff --git a/crates/polars-io/src/csv/write/options.rs b/crates/polars-io/src/csv/write/options.rs index e1c71f3873b4..b0602cbc2a92 100644 --- a/crates/polars-io/src/csv/write/options.rs +++ b/crates/polars-io/src/csv/write/options.rs @@ -40,6 +40,7 @@ pub struct SerializeOptions { pub datetime_format: Option, /// Used for [`DataType::Float64`](polars_core::datatypes::DataType::Float64) /// and [`DataType::Float32`](polars_core::datatypes::DataType::Float32). + pub float_scientific: Option, pub float_precision: Option, /// Used as separator. pub separator: u8, @@ -59,6 +60,7 @@ impl Default for SerializeOptions { date_format: None, time_format: None, datetime_format: None, + float_scientific: None, float_precision: None, separator: b',', quote_char: b'"', diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index 89983dfd8fce..badfd3c6dd73 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -30,6 +30,7 @@ //! but also with `QUOTE_NON_NULL = false`. //! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`. +use std::fmt::LowerExp; use std::io::Write; use arrow::array::{Array, BooleanArray, NullArray, PrimitiveArray, Utf8ViewArray}; @@ -38,6 +39,7 @@ use arrow::types::NativeType; #[cfg(feature = "timezones")] use chrono::TimeZone; use memchr::{memchr3, memchr_iter}; +use num_traits::NumCast; use polars_core::prelude::*; use crate::csv::write::{QuoteStyle, SerializeOptions}; @@ -121,13 +123,23 @@ fn integer_serializer(array: &PrimitiveArray) }) } -fn float_serializer_no_precision( +fn float_serializer_no_precision( array: &PrimitiveArray, ) -> impl Serializer { - let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { - let mut buffer = ryu::Buffer::new(); - let value = buffer.format(item); - buf.extend_from_slice(value.as_bytes()); + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| match _options + .float_scientific + { + Some(true) => write!(buf, "{item:.e}").unwrap(), + Some(false) => { + let v: f64 = NumCast::from(item).unwrap(); + let value = v.to_string(); + buf.extend_from_slice(value.as_bytes()); + }, + None => { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(item); + buf.extend_from_slice(value.as_bytes()); + }, }; make_serializer::<_, _, false>(f, array.iter(), |array| { @@ -139,13 +151,17 @@ fn float_serializer_no_precision( }) } -fn float_serializer_with_precision( +fn float_serializer_with_precision( array: &PrimitiveArray, precision: usize, ) -> impl Serializer { let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { // Float writing into a buffer of `Vec` cannot fail. - let _ = write!(buf, "{item:.precision$}"); + let _ = if _options.float_scientific.unwrap_or(false) { + write!(buf, "{item:.precision$e}") + } else { + write!(buf, "{item:.precision$}") + }; }; make_serializer::<_, _, false>(f, array.iter(), |array| { diff --git a/crates/polars-io/src/csv/write/writer.rs b/crates/polars-io/src/csv/write/writer.rs index 0bb6bd2ce534..9369dacbe6da 100644 --- a/crates/polars-io/src/csv/write/writer.rs +++ b/crates/polars-io/src/csv/write/writer.rs @@ -115,6 +115,14 @@ where self } + /// Set the CSV file's forced scientific notation for floats. + pub fn with_float_scientific(mut self, scientific: Option) -> Self { + if scientific.is_some() { + self.options.float_scientific = scientific; + } + self + } + /// Set the CSV file's float precision. pub fn with_float_precision(mut self, precision: Option) -> Self { if precision.is_some() { diff --git a/crates/polars-pipe/src/executors/sinks/output/csv.rs b/crates/polars-pipe/src/executors/sinks/output/csv.rs index 5c7481c2ad6f..773287b834b1 100644 --- a/crates/polars-pipe/src/executors/sinks/output/csv.rs +++ b/crates/polars-pipe/src/executors/sinks/output/csv.rs @@ -23,6 +23,7 @@ impl CsvSink { .with_datetime_format(options.serialize_options.datetime_format) .with_date_format(options.serialize_options.date_format) .with_time_format(options.serialize_options.time_format) + .with_float_scientific(options.serialize_options.float_scientific) .with_float_precision(options.serialize_options.float_precision) .with_null_value(options.serialize_options.null) .with_quote_style(options.serialize_options.quote_style) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ec0ce54249ba..142ce2fe186f 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2512,6 +2512,7 @@ def write_csv( datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., + float_scientific: bool | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., @@ -2531,6 +2532,7 @@ def write_csv( datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., + float_scientific: bool | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., @@ -2549,6 +2551,7 @@ def write_csv( datetime_format: str | None = None, date_format: str | None = None, time_format: str | None = None, + float_scientific: bool | None = None, float_precision: int | None = None, null_value: str | None = None, quote_style: CsvQuoteStyle | None = None, @@ -2587,6 +2590,9 @@ def write_csv( A format string, with the specifiers defined by the `chrono `_ Rust crate. + float_scientific + Whether to use of scientific form always (true) or never (false) or auto + (None) `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. @@ -2650,6 +2656,7 @@ def write_csv( datetime_format, date_format, time_format, + float_scientific, float_precision, null_value, quote_style, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 698884360910..2cc3018a6555 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2312,6 +2312,7 @@ def sink_csv( datetime_format: str | None = None, date_format: str | None = None, time_format: str | None = None, + float_scientific: bool | None = None, float_precision: int | None = None, null_value: str | None = None, quote_style: CsvQuoteStyle | None = None, @@ -2362,6 +2363,9 @@ def sink_csv( A format string, with the specifiers defined by the `chrono `_ Rust crate. + float_scientific + Whether to use of scientific form always (true) or never (false) or auto + (None) `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. @@ -2436,6 +2440,7 @@ def sink_csv( datetime_format=datetime_format, date_format=date_format, time_format=time_format, + float_scientific=float_scientific, float_precision=float_precision, null_value=null_value, quote_style=quote_style, diff --git a/py-polars/src/dataframe/io.rs b/py-polars/src/dataframe/io.rs index 3c46eb1dbd83..7542c5905193 100644 --- a/py-polars/src/dataframe/io.rs +++ b/py-polars/src/dataframe/io.rs @@ -370,6 +370,7 @@ impl PyDataFrame { datetime_format: Option, date_format: Option, time_format: Option, + float_scientific: Option, float_precision: Option, null_value: Option, quote_style: Option>, @@ -390,6 +391,7 @@ impl PyDataFrame { .with_datetime_format(datetime_format) .with_date_format(date_format) .with_time_format(time_format) + .with_float_scientific(float_scientific) .with_float_precision(float_precision) .with_null_value(null) .with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default()) @@ -408,6 +410,7 @@ impl PyDataFrame { .with_datetime_format(datetime_format) .with_date_format(date_format) .with_time_format(time_format) + .with_float_scientific(float_scientific) .with_float_precision(float_precision) .with_null_value(null) .with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default()) diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index 9de61cd2b022..d57c3af98394 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -721,7 +721,7 @@ impl PyLazyFrame { } #[cfg(all(feature = "streaming", feature = "csv"))] - #[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style, maintain_order))] + #[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_scientific, float_precision, null_value, quote_style, maintain_order))] fn sink_csv( &self, py: Python, @@ -735,6 +735,7 @@ impl PyLazyFrame { datetime_format: Option, date_format: Option, time_format: Option, + float_scientific: Option, float_precision: Option, null_value: Option, quote_style: Option>, @@ -747,6 +748,7 @@ impl PyLazyFrame { date_format, time_format, datetime_format, + float_scientific, float_precision, separator, quote_char, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index e66c1e53ef3b..a1a059ee62f6 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1347,6 +1347,52 @@ def test_float_precision(dtype: pl.Float32 | pl.Float64) -> None: assert df.write_csv(float_precision=3) == "col\n1.000\n2.200\n3.330\n" +def test_float_scientific() -> None: + df = ( + pl.Series( + "colf64", + [3.141592653589793 * mult for mult in (1e-8, 1e-3, 1e3, 1e17)], + dtype=pl.Float64, + ) + .to_frame() + .with_columns(pl.col("colf64").cast(pl.Float32).alias("colf32")) + ) + + assert ( + df.write_csv(float_precision=None, float_scientific=False) + == "colf64,colf32\n0.00000003141592653589793,0.00000003141592586075603\n0.0031415926535897933,0.0031415927223861217\n3141.592653589793,3141.5927734375\n314159265358979300,314159265516355600\n" + ) + assert ( + df.write_csv(float_precision=0, float_scientific=False) + == "colf64,colf32\n0,0\n0,0\n3142,3142\n314159265358979328,314159265516355584\n" + ) + assert ( + df.write_csv(float_precision=1, float_scientific=False) + == "colf64,colf32\n0.0,0.0\n0.0,0.0\n3141.6,3141.6\n314159265358979328.0,314159265516355584.0\n" + ) + assert ( + df.write_csv(float_precision=3, float_scientific=False) + == "colf64,colf32\n0.000,0.000\n0.003,0.003\n3141.593,3141.593\n314159265358979328.000,314159265516355584.000\n" + ) + + assert ( + df.write_csv(float_precision=None, float_scientific=True) + == "colf64,colf32\n3.141592653589793e-8,3.1415926e-8\n3.1415926535897933e-3,3.1415927e-3\n3.141592653589793e3,3.1415928e3\n3.141592653589793e17,3.1415927e17\n" + ) + assert ( + df.write_csv(float_precision=0, float_scientific=True) + == "colf64,colf32\n3e-8,3e-8\n3e-3,3e-3\n3e3,3e3\n3e17,3e17\n" + ) + assert ( + df.write_csv(float_precision=1, float_scientific=True) + == "colf64,colf32\n3.1e-8,3.1e-8\n3.1e-3,3.1e-3\n3.1e3,3.1e3\n3.1e17,3.1e17\n" + ) + assert ( + df.write_csv(float_precision=3, float_scientific=True) + == "colf64,colf32\n3.142e-8,3.142e-8\n3.142e-3,3.142e-3\n3.142e3,3.142e3\n3.142e17,3.142e17\n" + ) + + def test_skip_rows_different_field_len() -> None: csv = io.StringIO( textwrap.dedent( diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index e2533128cab9..1f60cd3c6f3e 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -148,6 +148,7 @@ def test_sink_csv_with_options() -> None: datetime_format="%Y", date_format="%d", time_format="%H", + float_scientific=True, float_precision=42, null_value="BOOM", quote_style="always", @@ -165,6 +166,7 @@ def test_sink_csv_with_options() -> None: datetime_format="%Y", date_format="%d", time_format="%H", + float_scientific=True, float_precision=42, null_value="BOOM", quote_style="always", From 871f975bb057c4ea39445f8fc5c58a1a3506a89a Mon Sep 17 00:00:00 2001 From: Luke Shingles Date: Fri, 21 Jun 2024 15:14:56 +0100 Subject: [PATCH 2/3] Fix missing word in doc --- py-polars/polars/dataframe/frame.py | 4 ++-- py-polars/polars/lazyframe/frame.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 142ce2fe186f..83d05afafc48 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2591,8 +2591,8 @@ def write_csv( `chrono `_ Rust crate. float_scientific - Whether to use of scientific form always (true) or never (false) or auto - (None) `Float32` and `Float64` datatypes. + Whether to use scientific form always (true), never (false), or + automatically (None) for `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 2cc3018a6555..0c56c702c398 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2364,8 +2364,8 @@ def sink_csv( `chrono `_ Rust crate. float_scientific - Whether to use of scientific form always (true) or never (false) or auto - (None) `Float32` and `Float64` datatypes. + Whether to use scientific form always (true), never (false), or + automatically (None) for `Float32` and `Float64` datatypes. float_precision Number of decimal places to write, applied to both `Float32` and `Float64` datatypes. From 8771d8fe64d8b41923f34be34933384728092a97 Mon Sep 17 00:00:00 2001 From: Luke Shingles Date: Sat, 22 Jun 2024 10:58:07 +0100 Subject: [PATCH 3/3] Split float serializer functions to remove branching --- .../src/csv/write/write_impl/serializer.rs | 107 ++++++++++++++---- 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index badfd3c6dd73..e6b29b80d2b0 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -123,23 +123,48 @@ fn integer_serializer(array: &PrimitiveArray) }) } -fn float_serializer_no_precision( +fn float_serializer_no_precision_autoformat( array: &PrimitiveArray, ) -> impl Serializer { - let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| match _options - .float_scientific - { - Some(true) => write!(buf, "{item:.e}").unwrap(), - Some(false) => { - let v: f64 = NumCast::from(item).unwrap(); - let value = v.to_string(); - buf.extend_from_slice(value.as_bytes()); - }, - None => { - let mut buffer = ryu::Buffer::new(); - let value = buffer.format(item); - buf.extend_from_slice(value.as_bytes()); - }, + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(item); + buf.extend_from_slice(value.as_bytes()); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_no_precision_scientific( + array: &PrimitiveArray, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + // Float writing into a buffer of `Vec` cannot fail. + let _ = write!(buf, "{item:.e}"); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_no_precision_positional( + array: &PrimitiveArray, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + let v: f64 = NumCast::from(item).unwrap(); + let value = v.to_string(); + buf.extend_from_slice(value.as_bytes()); }; make_serializer::<_, _, false>(f, array.iter(), |array| { @@ -151,17 +176,31 @@ fn float_serializer_no_precision( +fn float_serializer_with_precision_scientific( array: &PrimitiveArray, precision: usize, ) -> impl Serializer { let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { // Float writing into a buffer of `Vec` cannot fail. - let _ = if _options.float_scientific.unwrap_or(false) { - write!(buf, "{item:.precision$e}") - } else { - write!(buf, "{item:.precision$}") - }; + let _ = write!(buf, "{item:.precision$e}"); + }; + + make_serializer::<_, _, false>(f, array.iter(), |array| { + array + .as_any() + .downcast_ref::>() + .expect(ARRAY_MISMATCH_MSG) + .iter() + }) +} + +fn float_serializer_with_precision_positional( + array: &PrimitiveArray, + precision: usize, +) -> impl Serializer { + let f = move |&item, buf: &mut Vec, _options: &SerializeOptions| { + // Float writing into a buffer of `Vec` cannot fail. + let _ = write!(buf, "{item:.precision$}"); }; make_serializer::<_, _, false>(f, array.iter(), |array| { @@ -479,12 +518,30 @@ pub(super) fn serializer_for<'a>( DataType::Int64 => quote_if_always!(integer_serializer::), DataType::UInt64 => quote_if_always!(integer_serializer::), DataType::Float32 => match options.float_precision { - Some(precision) => quote_if_always!(float_serializer_with_precision::, precision), - None => quote_if_always!(float_serializer_no_precision::), + Some(precision) => match options.float_scientific { + Some(true) => { + quote_if_always!(float_serializer_with_precision_scientific::, precision) + }, + _ => quote_if_always!(float_serializer_with_precision_positional::, precision), + }, + None => match options.float_scientific { + Some(true) => quote_if_always!(float_serializer_no_precision_scientific::), + Some(false) => quote_if_always!(float_serializer_no_precision_positional::), + None => quote_if_always!(float_serializer_no_precision_autoformat::), + }, }, DataType::Float64 => match options.float_precision { - Some(precision) => quote_if_always!(float_serializer_with_precision::, precision), - None => quote_if_always!(float_serializer_no_precision::), + Some(precision) => match options.float_scientific { + Some(true) => { + quote_if_always!(float_serializer_with_precision_scientific::, precision) + }, + _ => quote_if_always!(float_serializer_with_precision_positional::, precision), + }, + None => match options.float_scientific { + Some(true) => quote_if_always!(float_serializer_no_precision_scientific::), + Some(false) => quote_if_always!(float_serializer_no_precision_positional::), + None => quote_if_always!(float_serializer_no_precision_autoformat::), + }, }, DataType::Null => quote_if_always!(null_serializer), DataType::Boolean => {