From 7c8333f903a0d86dc2f3af0dca0d0160d7700cb3 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 6 May 2024 07:26:09 -0700 Subject: [PATCH] fix: fix a panic that could happen when scanning only the row id from fragment with deleted rows (#2302) --- python/python/tests/test_dataset.py | 8 ++++++++ rust/lance/src/dataset/fragment.rs | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 0209068a1f..8706f903e2 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -1333,6 +1333,14 @@ def test_scan_no_columns(tmp_path: Path): with pytest.raises(ValueError, match="no columns were selected"): dataset.scanner(columns=[]).to_table() + # also test with deleted data to make sure deleted ids not included + dataset.delete("a = 5") + num_rows = 0 + for batch in dataset.scanner(columns=[], with_row_id=True).to_batches(): + num_rows += batch.num_rows + + assert num_rows == 99 + def test_scan_prefilter(tmp_path: Path): base_dir = tmp_path / "dataset" diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index c0d3ad59a6..28e59d6e42 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1328,12 +1328,13 @@ impl FragmentReader { u64::from(RowAddress::new_from_parts(self.fragment_id as u32, *row_id)) }) .collect(); + let num_intact_rows = row_ids.len() as u32; let row_ids_array = UInt64Array::from(row_ids); let row_id_schema = Arc::new(self.output_schema.clone()); - let tasks = (0..total_num_rows) + let tasks = (0..num_intact_rows) .step_by(batch_size as usize) .map(move |offset| { - let length = batch_size.min(total_num_rows - offset); + let length = batch_size.min(num_intact_rows - offset); let array = Arc::new(row_ids_array.slice(offset as usize, length as usize)); let batch = RecordBatch::try_new(row_id_schema.clone(), vec![array]); std::future::ready(batch.map_err(Error::from)).boxed()