Skip to content

Commit

Permalink
fix: Incorrect parquet statistics written for UInt64 values > Int64::…
Browse files Browse the repository at this point in the history
…MAX (#16766)
  • Loading branch information
nameexhaustion authored Jun 6, 2024
1 parent 5a0c803 commit b329894
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
15 changes: 5 additions & 10 deletions crates/polars-parquet/src/arrow/write/primitive/basic.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use arrow::array::{Array, PrimitiveArray};
use arrow::types::NativeType;
use polars_error::{polars_bail, PolarsResult};
use polars_utils::total_ord::TotalOrd;

use super::super::{utils, WriteOptions};
use crate::arrow::read::schema::is_nullable;
Expand Down Expand Up @@ -176,23 +177,17 @@ where
.then(|| {
array
.non_null_values_iter()
.map(|x| {
let x: P = x.as_();
x
})
.max_by(|x, y| x.ord(y))
.max_by(TotalOrd::tot_cmp)
.map(T::as_)
})
.flatten(),
min_value: options
.min_value
.then(|| {
array
.non_null_values_iter()
.map(|x| {
let x: P = x.as_();
x
})
.min_by(|x, y| x.ord(y))
.min_by(TotalOrd::tot_cmp)
.map(T::as_)
})
.flatten(),
}
Expand Down
12 changes: 12 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,3 +970,15 @@ def test_hybrid_rle() -> None:
assert "RLE_DICTIONARY" in column["encodings"]
f.seek(0)
assert_frame_equal(pl.read_parquet(f), df)


def test_parquet_statistics_uint64_16683() -> None:
    """Check the parquet min/max statistics written for a UInt64 column.

    The column holds the largest unsigned 64-bit value (which is above the
    signed 64-bit maximum) together with zero. After writing with statistics
    enabled, the metadata of the first column in the first row group must
    report exactly those two values as max and min.
    """
    largest_u64 = (1 << 64) - 1
    frame = pl.Series("a", [largest_u64, 0], dtype=pl.UInt64).to_frame()

    # Round-trip through an in-memory buffer rather than a file on disk.
    buffer = io.BytesIO()
    frame.write_parquet(buffer, statistics=True)
    buffer.seek(0)

    # Inspect the written metadata directly instead of reading the data back.
    row_group = pq.read_metadata(buffer).row_group(0)
    column_stats = row_group.column(0).statistics

    assert column_stats.min == 0
    assert column_stats.max == largest_u64

0 comments on commit b329894

Please sign in to comment.