Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add float_scientific option to write_csv/sink_csv #17111

Merged
merged 3 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions crates/polars-io/src/csv/write/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pub struct SerializeOptions {
pub datetime_format: Option<String>,
/// Used for [`DataType::Float64`](polars_core::datatypes::DataType::Float64)
/// and [`DataType::Float32`](polars_core::datatypes::DataType::Float32).
/// `Some(true)` forces scientific notation, `Some(false)` forces positional
/// notation, and `None` leaves the choice to the serializer.
pub float_scientific: Option<bool>,
/// Used for [`DataType::Float64`](polars_core::datatypes::DataType::Float64)
/// and [`DataType::Float32`](polars_core::datatypes::DataType::Float32).
/// Number of decimal places to write; `None` applies no fixed precision.
pub float_precision: Option<usize>,
/// Used as separator.
pub separator: u8,
Expand All @@ -59,6 +60,7 @@ impl Default for SerializeOptions {
date_format: None,
time_format: None,
datetime_format: None,
float_scientific: None,
float_precision: None,
separator: b',',
quote_char: b'"',
Expand Down
85 changes: 79 additions & 6 deletions crates/polars-io/src/csv/write/write_impl/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
//! but also with `QUOTE_NON_NULL = false`.
//! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`.

use std::fmt::LowerExp;
use std::io::Write;

use arrow::array::{Array, BooleanArray, NullArray, PrimitiveArray, Utf8ViewArray};
Expand All @@ -38,6 +39,7 @@ use arrow::types::NativeType;
#[cfg(feature = "timezones")]
use chrono::TimeZone;
use memchr::{memchr3, memchr_iter};
use num_traits::NumCast;
use polars_core::prelude::*;

use crate::csv::write::{QuoteStyle, SerializeOptions};
Expand Down Expand Up @@ -121,7 +123,7 @@ fn integer_serializer<I: NativeType + itoa::Integer>(array: &PrimitiveArray<I>)
})
}

fn float_serializer_no_precision<I: NativeType + ryu::Float>(
fn float_serializer_no_precision_autoformat<I: NativeType + ryu::Float>(
array: &PrimitiveArray<I>,
) -> impl Serializer {
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
Expand All @@ -139,7 +141,60 @@ fn float_serializer_no_precision<I: NativeType + ryu::Float>(
})
}

fn float_serializer_with_precision<I: NativeType>(
/// Builds a serializer that always writes floats in scientific (`e`) notation,
/// without a fixed number of decimals.
///
/// Each non-null value is formatted through its [`LowerExp`] implementation.
fn float_serializer_no_precision_scientific<I: NativeType + LowerExp>(
    array: &PrimitiveArray<I>,
) -> impl Serializer {
    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
        // Float writing into a buffer of `Vec<u8>` cannot fail.
        // No precision is requested, so the spec is plain `e` (no dangling `.`).
        let _ = write!(buf, "{item:e}");
    };

    make_serializer::<_, _, false>(f, array.iter(), |array| {
        array
            .as_any()
            .downcast_ref::<PrimitiveArray<I>>()
            .expect(ARRAY_MISMATCH_MSG)
            .iter()
    })
}

fn float_serializer_no_precision_positional<I: NativeType + NumCast>(
array: &PrimitiveArray<I>,
) -> impl Serializer {
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
let v: f64 = NumCast::from(item).unwrap();
let value = v.to_string();
buf.extend_from_slice(value.as_bytes());
};

make_serializer::<_, _, false>(f, array.iter(), |array| {
array
.as_any()
.downcast_ref::<PrimitiveArray<I>>()
.expect(ARRAY_MISMATCH_MSG)
.iter()
})
}

/// Builds a serializer that writes floats in scientific (`e`) notation using
/// `precision` as the format precision.
fn float_serializer_with_precision_scientific<I: NativeType + LowerExp>(
    array: &PrimitiveArray<I>,
    precision: usize,
) -> impl Serializer {
    let write_scientific = move |&item, out: &mut Vec<u8>, _: &SerializeOptions| {
        // Formatting into an in-memory `Vec<u8>` is infallible, so the result is dropped.
        let _ = out.write_fmt(format_args!("{item:.precision$e}"));
    };

    make_serializer::<_, _, false>(write_scientific, array.iter(), |next| {
        let typed = next.as_any().downcast_ref::<PrimitiveArray<I>>();
        typed.expect(ARRAY_MISMATCH_MSG).iter()
    })
}

fn float_serializer_with_precision_positional<I: NativeType>(
array: &PrimitiveArray<I>,
precision: usize,
) -> impl Serializer {
Expand Down Expand Up @@ -463,12 +518,30 @@ pub(super) fn serializer_for<'a>(
DataType::Int64 => quote_if_always!(integer_serializer::<i64>),
DataType::UInt64 => quote_if_always!(integer_serializer::<u64>),
DataType::Float32 => match options.float_precision {
Some(precision) => quote_if_always!(float_serializer_with_precision::<f32>, precision),
None => quote_if_always!(float_serializer_no_precision::<f32>),
Some(precision) => match options.float_scientific {
Some(true) => {
quote_if_always!(float_serializer_with_precision_scientific::<f32>, precision)
},
_ => quote_if_always!(float_serializer_with_precision_positional::<f32>, precision),
},
None => match options.float_scientific {
Some(true) => quote_if_always!(float_serializer_no_precision_scientific::<f32>),
Some(false) => quote_if_always!(float_serializer_no_precision_positional::<f32>),
None => quote_if_always!(float_serializer_no_precision_autoformat::<f32>),
},
},
DataType::Float64 => match options.float_precision {
Some(precision) => quote_if_always!(float_serializer_with_precision::<f64>, precision),
None => quote_if_always!(float_serializer_no_precision::<f64>),
Some(precision) => match options.float_scientific {
Some(true) => {
quote_if_always!(float_serializer_with_precision_scientific::<f64>, precision)
},
_ => quote_if_always!(float_serializer_with_precision_positional::<f64>, precision),
},
None => match options.float_scientific {
Some(true) => quote_if_always!(float_serializer_no_precision_scientific::<f64>),
Some(false) => quote_if_always!(float_serializer_no_precision_positional::<f64>),
None => quote_if_always!(float_serializer_no_precision_autoformat::<f64>),
},
},
DataType::Null => quote_if_always!(null_serializer),
DataType::Boolean => {
Expand Down
8 changes: 8 additions & 0 deletions crates/polars-io/src/csv/write/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ where
self
}

/// Force (`Some(true)`) or forbid (`Some(false)`) scientific notation for
/// floats in the CSV output. Passing `None` keeps the currently configured value.
pub fn with_float_scientific(mut self, scientific: Option<bool>) -> Self {
    if let Some(flag) = scientific {
        self.options.float_scientific = Some(flag);
    }
    self
}

/// Set the CSV file's float precision.
pub fn with_float_precision(mut self, precision: Option<usize>) -> Self {
if precision.is_some() {
Expand Down
1 change: 1 addition & 0 deletions crates/polars-pipe/src/executors/sinks/output/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ impl CsvSink {
.with_datetime_format(options.serialize_options.datetime_format)
.with_date_format(options.serialize_options.date_format)
.with_time_format(options.serialize_options.time_format)
.with_float_scientific(options.serialize_options.float_scientific)
.with_float_precision(options.serialize_options.float_precision)
.with_null_value(options.serialize_options.null)
.with_quote_style(options.serialize_options.quote_style)
Expand Down
7 changes: 7 additions & 0 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2512,6 +2512,7 @@ def write_csv(
datetime_format: str | None = ...,
date_format: str | None = ...,
time_format: str | None = ...,
float_scientific: bool | None = ...,
float_precision: int | None = ...,
null_value: str | None = ...,
quote_style: CsvQuoteStyle | None = ...,
Expand All @@ -2531,6 +2532,7 @@ def write_csv(
datetime_format: str | None = ...,
date_format: str | None = ...,
time_format: str | None = ...,
float_scientific: bool | None = ...,
float_precision: int | None = ...,
null_value: str | None = ...,
quote_style: CsvQuoteStyle | None = ...,
Expand All @@ -2549,6 +2551,7 @@ def write_csv(
datetime_format: str | None = None,
date_format: str | None = None,
time_format: str | None = None,
float_scientific: bool | None = None,
float_precision: int | None = None,
null_value: str | None = None,
quote_style: CsvQuoteStyle | None = None,
Expand Down Expand Up @@ -2587,6 +2590,9 @@ def write_csv(
A format string, with the specifiers defined by the
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
Rust crate.
float_scientific
Whether to use scientific form always (True), never (False), or
automatically (None) for `Float32` and `Float64` datatypes.
float_precision
Number of decimal places to write, applied to both `Float32` and
`Float64` datatypes.
Expand Down Expand Up @@ -2650,6 +2656,7 @@ def write_csv(
datetime_format,
date_format,
time_format,
float_scientific,
float_precision,
null_value,
quote_style,
Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2312,6 +2312,7 @@ def sink_csv(
datetime_format: str | None = None,
date_format: str | None = None,
time_format: str | None = None,
float_scientific: bool | None = None,
float_precision: int | None = None,
null_value: str | None = None,
quote_style: CsvQuoteStyle | None = None,
Expand Down Expand Up @@ -2362,6 +2363,9 @@ def sink_csv(
A format string, with the specifiers defined by the
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
Rust crate.
float_scientific
Whether to use scientific form always (True), never (False), or
automatically (None) for `Float32` and `Float64` datatypes.
float_precision
Number of decimal places to write, applied to both `Float32` and
`Float64` datatypes.
Expand Down Expand Up @@ -2436,6 +2440,7 @@ def sink_csv(
datetime_format=datetime_format,
date_format=date_format,
time_format=time_format,
float_scientific=float_scientific,
float_precision=float_precision,
null_value=null_value,
quote_style=quote_style,
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ impl PyDataFrame {
datetime_format: Option<String>,
date_format: Option<String>,
time_format: Option<String>,
float_scientific: Option<bool>,
float_precision: Option<usize>,
null_value: Option<String>,
quote_style: Option<Wrap<QuoteStyle>>,
Expand All @@ -390,6 +391,7 @@ impl PyDataFrame {
.with_datetime_format(datetime_format)
.with_date_format(date_format)
.with_time_format(time_format)
.with_float_scientific(float_scientific)
.with_float_precision(float_precision)
.with_null_value(null)
.with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default())
Expand All @@ -408,6 +410,7 @@ impl PyDataFrame {
.with_datetime_format(datetime_format)
.with_date_format(date_format)
.with_time_format(time_format)
.with_float_scientific(float_scientific)
.with_float_precision(float_precision)
.with_null_value(null)
.with_quote_style(quote_style.map(|wrap| wrap.0).unwrap_or_default())
Expand Down
4 changes: 3 additions & 1 deletion py-polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ impl PyLazyFrame {
}

#[cfg(all(feature = "streaming", feature = "csv"))]
#[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style, maintain_order))]
#[pyo3(signature = (path, include_bom, include_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_scientific, float_precision, null_value, quote_style, maintain_order))]
fn sink_csv(
&self,
py: Python,
Expand All @@ -735,6 +735,7 @@ impl PyLazyFrame {
datetime_format: Option<String>,
date_format: Option<String>,
time_format: Option<String>,
float_scientific: Option<bool>,
float_precision: Option<usize>,
null_value: Option<String>,
quote_style: Option<Wrap<QuoteStyle>>,
Expand All @@ -747,6 +748,7 @@ impl PyLazyFrame {
date_format,
time_format,
datetime_format,
float_scientific,
float_precision,
separator,
quote_char,
Expand Down
46 changes: 46 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,52 @@ def test_float_precision(dtype: pl.Float32 | pl.Float64) -> None:
assert df.write_csv(float_precision=3) == "col\n1.000\n2.200\n3.330\n"


def test_float_scientific() -> None:
    """Check `write_csv` output for each `float_precision`/`float_scientific` pairing."""
    multipliers = (1e-8, 1e-3, 1e3, 1e17)
    colf64 = pl.Series(
        "colf64",
        [3.141592653589793 * mult for mult in multipliers],
        dtype=pl.Float64,
    )
    df = colf64.to_frame().with_columns(
        pl.col("colf64").cast(pl.Float32).alias("colf32")
    )

    # Each entry: (float_precision, float_scientific, expected CSV text).
    cases = [
        (
            None,
            False,
            "colf64,colf32\n0.00000003141592653589793,0.00000003141592586075603\n0.0031415926535897933,0.0031415927223861217\n3141.592653589793,3141.5927734375\n314159265358979300,314159265516355600\n",
        ),
        (
            0,
            False,
            "colf64,colf32\n0,0\n0,0\n3142,3142\n314159265358979328,314159265516355584\n",
        ),
        (
            1,
            False,
            "colf64,colf32\n0.0,0.0\n0.0,0.0\n3141.6,3141.6\n314159265358979328.0,314159265516355584.0\n",
        ),
        (
            3,
            False,
            "colf64,colf32\n0.000,0.000\n0.003,0.003\n3141.593,3141.593\n314159265358979328.000,314159265516355584.000\n",
        ),
        (
            None,
            True,
            "colf64,colf32\n3.141592653589793e-8,3.1415926e-8\n3.1415926535897933e-3,3.1415927e-3\n3.141592653589793e3,3.1415928e3\n3.141592653589793e17,3.1415927e17\n",
        ),
        (
            0,
            True,
            "colf64,colf32\n3e-8,3e-8\n3e-3,3e-3\n3e3,3e3\n3e17,3e17\n",
        ),
        (
            1,
            True,
            "colf64,colf32\n3.1e-8,3.1e-8\n3.1e-3,3.1e-3\n3.1e3,3.1e3\n3.1e17,3.1e17\n",
        ),
        (
            3,
            True,
            "colf64,colf32\n3.142e-8,3.142e-8\n3.142e-3,3.142e-3\n3.142e3,3.142e3\n3.142e17,3.142e17\n",
        ),
    ]

    for precision, scientific, expected in cases:
        written = df.write_csv(float_precision=precision, float_scientific=scientific)
        assert written == expected


def test_skip_rows_different_field_len() -> None:
csv = io.StringIO(
textwrap.dedent(
Expand Down
2 changes: 2 additions & 0 deletions py-polars/tests/unit/streaming/test_streaming_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def test_sink_csv_with_options() -> None:
datetime_format="%Y",
date_format="%d",
time_format="%H",
float_scientific=True,
float_precision=42,
null_value="BOOM",
quote_style="always",
Expand All @@ -165,6 +166,7 @@ def test_sink_csv_with_options() -> None:
datetime_format="%Y",
date_format="%d",
time_format="%H",
float_scientific=True,
float_precision=42,
null_value="BOOM",
quote_style="always",
Expand Down