lancedb · wjones127 · Jun 6, 2023 · Apr 13, 2023 · Apr 13, 2023 · Apr 13, 2023
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -356,6 +356,39 @@ def join(
         Not implemented (just override pyarrow dataset to prevent segfault)
         """
         raise NotImplementedError("Versioning not yet supported in Rust")
+
+    def merge(
+        self,
+        data_obj: ReaderLike,
+        left_on: str,
+        right_on: Optional[str] = None,
+    ):
+        """
+        Merge another dataset into this one.
+
+        Performs a left join, where the dataset is the left side and data_obj
+        is the right side. Rows existing in the dataset but not in data_obj will
+        be filled with null values, unless Lance doesn't support null values for
+        some types, in which case an error will be raised.
+
+        Parameters
+        ----------
+        data_obj: Reader-like
+            The data to be merged. Acceptable types are:
+            - Pandas DataFrame, Pyarrow Table, Dataset, Scanner, or RecordBatchReader
+        left_on: str
+            The name of the column in the dataset to join on.
+        right_on: str or None
+            The name of the column in data_obj to join on. If None, defaults to
+            left_on.
+        """
+        if right_on is None:
+            right_on = left_on
+
+        reader = _coerce_reader(data_obj)
+
+        self._ds.merge(reader, left_on, right_on)
+
 
     def versions(self):
         """
@@ -808,18 +841,7 @@ def write_dataset(
         The max number of rows before starting a new group (in the same file)
 
     """
-    if isinstance(data_obj, pd.DataFrame):
-        reader = pa.Table.from_pandas(data_obj, schema=schema).to_reader()
-    elif isinstance(data_obj, pa.Table):
-        reader = data_obj.to_reader()
-    elif isinstance(data_obj, pa.dataset.Dataset):
-        reader = pa.dataset.Scanner.from_dataset(data_obj).to_reader()
-    elif isinstance(data_obj, pa.dataset.Scanner):
-        reader = data_obj.to_reader()
-    elif isinstance(data_obj, pa.RecordBatchReader):
-        reader = data_obj
-    else:
-        raise TypeError(f"Unknown data_obj type {type(data_obj)}")
+    reader = _coerce_reader(data_obj, schema)  # keep honoring `schema` for DataFrames
     # TODO add support for passing in LanceDataset and LanceScanner here
 
     params = {
@@ -831,3 +853,18 @@ def write_dataset(
     uri = os.fspath(uri) if isinstance(uri, Path) else uri
     _write_dataset(reader, uri, params)
     return LanceDataset(uri)
+
+
+def _coerce_reader(data_obj: ReaderLike, schema: Optional[pa.Schema] = None) -> pa.RecordBatchReader:
+    """Convert a supported reader-like object into a ``pa.RecordBatchReader``."""
+    if isinstance(data_obj, pd.DataFrame):
+        return pa.Table.from_pandas(data_obj, schema=schema).to_reader()
+    if isinstance(data_obj, pa.Table):
+        return data_obj.to_reader()
+    if isinstance(data_obj, pa.dataset.Dataset):
+        return pa.dataset.Scanner.from_dataset(data_obj).to_reader()
+    if isinstance(data_obj, pa.dataset.Scanner):
+        return data_obj.to_reader()
+    if isinstance(data_obj, pa.RecordBatchReader):
+        return data_obj
+    raise TypeError(f"Unknown data_obj type {type(data_obj)}")
diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py
@@ -357,3 +357,33 @@ def test_load_scanner_from_fragments(tmp_path: Path):
     # Accepts an iterator
     scanner = dataset.scanner(fragments=iter(fragments[0:2]), scan_in_order=False)
     assert scanner.to_table().num_rows == 2 * 100
+
+
+def test_merge_data(tmp_path: Path):
+    tab = pa.table({"a": range(100), "b": range(100)})
+    lance.write_dataset(tab, tmp_path / "dataset", mode="append")
+
+    dataset = lance.dataset(tmp_path / "dataset")
+
+    # Rejects a partial merge when the new column's type cannot hold nulls.
+    new_tab = pa.table({"a": range(40), "c": range(40)})
+    # TODO: this should raise ValueError rather than OSError.
+    with pytest.raises(OSError, match=".+Lance does not yet support nulls for type Int64."):
+        dataset.merge(new_tab, "a")
+
+    # Accepts a merge whose keys cover every row of the dataset.
+    new_tab = pa.table({"a": range(100), "c": range(100)})
+    dataset.merge(new_tab, "a")
+    assert dataset.version == 2
+    assert dataset.to_table() == pa.table({"a": range(100), "b": range(100), "c": range(100)})
+
+    # Accepts a partial merge for a string column; unmatched rows become null.
+    new_tab = pa.table({"a2": range(5), "d": ["a", "b", "c", "d", "e"]})
+    dataset.merge(new_tab, left_on="a", right_on="a2")
+    assert dataset.version == 3
+    assert dataset.to_table() == pa.table({
+        "a": range(100),
+        "b": range(100),
+        "c": range(100),
+        "d": ["a", "b", "c", "d", "e"] + [None] * 95
+    })
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
@@ -281,6 +281,22 @@ impl Dataset {
         batch.to_pyarrow(self_.py())
     }
 
+    fn merge(
+        &mut self,
+        reader: PyArrowType<ArrowArrayStreamReader>,
+        left_on: &str,
+        right_on: &str,
+    ) -> PyResult<()> {
+        let mut batches: Box<dyn RecordBatchReader> = Box::new(reader.0);
+        // Merge into a copy so `self.ds` is only replaced after the merge succeeds.
+        let mut updated = self.ds.as_ref().clone();
+        self.rt
+            .block_on(updated.merge(&mut batches, left_on, right_on))
+            .map_err(|err| PyIOError::new_err(err.to_string()))?;
+        self.ds = Arc::new(updated);
+        Ok(())
+    }
+
     fn versions(self_: PyRef<'_, Self>) -> PyResult<Vec<PyObject>> {
         let versions = self_
             .list_versions()

diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -34,13 +34,16 @@ arrow-cast = "37.0.0"
 arrow-data = "37.0"
 arrow-ipc = { version = "37.0", features = ["zstd"] }
 arrow-ord = "37.0"
+arrow-row = "37.0"
 arrow-schema = "37.0"
 arrow-select = "37.0"
 async-recursion = "1.0"
 async-trait = "0.1.60"
 byteorder = "1.4.3"
 chrono = "0.4.23"
 clap = { version = "4.1.1", features = ["derive"], optional = true }
+# This is already used by datafusion
+dashmap = "5"
 object_store = { version = "0.5.6", features = ["aws_profile", "gcp"] }
 reqwest = { version = "0.11.16" }
 aws-config = "0.54"

diff --git a/rust/src/arrow/record_batch.rs b/rust/src/arrow/record_batch.rs
@@ -1,19 +1,16 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
+// Copyright 2023 Lance Developers.
 //
-//   http://www.apache.org/licenses/LICENSE-2.0
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Additional utility for [`RecordBatch`]
 //!
@@ -23,6 +20,9 @@ use arrow_schema::{ArrowError, SchemaRef};
 
 use crate::Result;
 
+/// RecordBatchBuffer is an in-memory buffer for multiple [`RecordBatch`]s.
+///
+///
 #[derive(Debug)]
 pub struct RecordBatchBuffer {
     pub batches: Vec<RecordBatch>,
@@ -69,3 +69,10 @@ impl Iterator for RecordBatchBuffer {
         }
     }
 }
+
+impl FromIterator<RecordBatch> for RecordBatchBuffer {
+    /// Collects every [`RecordBatch`] yielded by `iter` into a new buffer.
+    fn from_iter<T: IntoIterator<Item = RecordBatch>>(iter: T) -> Self {
+        Self::new(iter.into_iter().collect::<Vec<_>>())
+    }
+}