Skip to content

Commit

Permalink
Expand testing to make sure we return nice error messages for the unsupported operations
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace committed Nov 5, 2024
1 parent 5fe765f commit 74a6ba3
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 134 deletions.
74 changes: 74 additions & 0 deletions python/python/tests/test_balanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,80 @@ def test_compaction(tmp_path, big_val):
)


def test_schema(balanced_dataset):
    # The dataset schema must still expose the blob column, carrying the
    # storage-class metadata that marks it as blob-backed, next to the
    # ordinary idx column.
    blob_field = pa.field(
        "blobs",
        pa.large_binary(),
        metadata={"lance-schema:storage-class": "blob"},
    )
    idx_field = pa.field("idx", pa.uint64())
    assert balanced_dataset.schema == pa.schema([blob_field, idx_field])


def test_sample(balanced_dataset):
    # Sampling only the non-blob column is supported and returns the
    # requested number of rows.
    assert balanced_dataset.sample(10, columns=["idx"]).num_rows == 10

    # Any sample that touches the blob column is not supported yet.
    # The error message is not the most obvious but hopefully not long lived.
    addr_to_id_err = "Not supported.*mapping from row addresses to row ids"
    with pytest.raises(OSError, match=addr_to_id_err):
        balanced_dataset.sample(10)
    with pytest.raises(OSError, match=addr_to_id_err):
        balanced_dataset.sample(10, columns=["blobs"])


def test_add_columns(tmp_path, balanced_dataset):
    # Adding a derived column is fine as long as the expression never
    # touches the blob column.
    balanced_dataset.add_columns({"idx2": "idx * 2"})

    expected = pa.table(
        {
            "idx": pa.array(range(128), pa.uint64()),
            "idx2": pa.array(range(0, 256, 2), pa.uint64()),
        }
    )
    assert balanced_dataset.to_table() == expected

    # A new column whose value requires scanning blob (non-default) storage
    # must be rejected with a helpful error message.
    with pytest.raises(
        OSError, match="Not supported.*adding columns.*scanning non-default storage"
    ):
        balanced_dataset.add_columns({"blobs2": "blobs"})


def test_unsupported(balanced_dataset, big_val):
    # Update and merge-insert both add/modify rows, which would require
    # keeping the sibling (blob) datasets in sync.  That is not implemented
    # yet, so each operation must fail with a useful error message.

    with pytest.raises(
        ValueError, match="Not supported.*Updating.*non-default storage"
    ):
        balanced_dataset.update({"idx": "0"})

    with pytest.raises(
        # This error could be nicer but it's fine for now
        OSError,
        match="Not supported.*Scanning.*non-default storage",
    ):
        merge = balanced_dataset.merge_insert("idx").when_not_matched_insert_all()
        merge.execute(make_table(0, 1, big_val))


# TODO: Once https://github.com/lancedb/lance/pull/3041 merges we will
# want to test partial appends. We need to make sure an append of
# non-blob data is supported. In order to do this we need to make
Expand Down
11 changes: 1 addition & 10 deletions rust/lance/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1199,12 +1199,7 @@ impl Dataset {
row_indices: &[u64],
projection: impl Into<ProjectionRequest>,
) -> Result<RecordBatch> {
take::take(
self,
row_indices,
&projection.into().into_projection_plan(self.schema())?,
)
.await
take::take(self, row_indices, projection.into()).await
}

/// Take Rows by the internal ROW ids.
Expand Down Expand Up @@ -1605,10 +1600,6 @@ impl Dataset {
.collect())
}

// Leaving this here so it is more obvious to future readers that we can do this and
// someone doesn't go off and create a new function to do this. Delete this comment
// if you use this method.
#[allow(unused)]
/// Filter the given row addresses down to those that are not deleted.
///
/// Thin wrapper over `filter_addr_or_ids`, passing `addrs` as both the
/// address and id arguments (valid because for this call the two views
/// are the same slice).
pub(crate) async fn filter_deleted_addresses(&self, addrs: &[u64]) -> Result<Vec<u64>> {
    self.filter_addr_or_ids(addrs, addrs).await
}
Expand Down
8 changes: 8 additions & 0 deletions rust/lance/src/dataset/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,14 @@ impl FileFragment {
}
schema = schema.project(&projection)?;
}

if schema.fields.iter().any(|f| !f.is_default_storage()) {
return Err(Error::NotSupported {
source: "adding columns whose value depends on scanning non-default storage".into(),
location: location!(),
});
}

// If there is no projection, we at least need to read the row addresses
with_row_addr |= schema.fields.is_empty();

Expand Down
Loading

0 comments on commit 74a6ba3

Please sign in to comment.