Skip to content

Commit

Permalink
fix: Only validate UTF-8 for selected items when all below len 128 (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Dec 6, 2024
1 parent ab8b71a commit 4a18809
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ pub fn decode_plain_generic(
// only a valid first byte of a UTF-8 code-point and (L, 0, 0, 0) is valid UTF-8.
// Consequently, it is valid to just check the whole buffer.
} else if all_len_below_128 {
if simdutf8::basic::from_utf8(values).is_err() {
if simdutf8::basic::from_utf8(&values[..values.len() - mvalues.len()]).is_err() {
return Err(invalid_utf8_err());
}
} else {
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2579,3 +2579,13 @@ def test_prefilter_with_projection_column_order_20175(tmp_path: Path) -> None:
1,1,1
"""),
)


def test_utf8_verification_with_slice_20174() -> None:
    """Check that reading a row slice of a plain-encoded string column works.

    A two-row string column (a 1-character value followed by a
    128-character value) is written with pyarrow, dictionary encoding
    disabled, and then only the first row is collected through a lazy scan.
    The test passes as long as the collect does not raise.
    """
    # NOTE(review): presumably the 128-character second value is what used to
    # trip UTF-8 validation when it was not part of the selected rows --
    # confirm against issue 20174.
    frame = pl.Series("s", ["a", "a" * 128]).to_frame()

    buffer = io.BytesIO()
    pq.write_table(frame.to_arrow(), buffer, use_dictionary=False)

    # Rewind so the scan reads the file from its start.
    buffer.seek(0)
    pl.scan_parquet(buffer).head(1).collect()

0 comments on commit 4a18809

Please sign in to comment.