From 62b8eeb24fc0a2f04f42b4d950c4e6bedab22f36 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 13 Jan 2025 06:33:43 -0800 Subject: [PATCH] Need to prefix column names when doing join with identical schemas now --- Cargo.lock | 586 ++++++++----------- rust/lance-datafusion/src/exec.rs | 2 +- rust/lance-datafusion/src/planner.rs | 2 +- rust/lance/src/dataset/scanner.rs | 38 +- rust/lance/src/dataset/write/merge_insert.rs | 60 +- 5 files changed, 322 insertions(+), 366 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c428391884..030211518e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -429,24 +429,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "async-compression" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" -dependencies = [ - "bzip2", - "flate2", - "futures-core", - "futures-io", - "memchr", - "pin-project-lite", - "tokio", - "xz2", - "zstd", - "zstd-safe", -] - [[package]] name = "async-executor" version = "1.13.1" @@ -522,7 +504,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -565,7 +547,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -964,6 +946,19 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bigdecimal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bincode" version = "1.3.3" @@ -1021,28 +1016,6 @@ dependencies = [ "wyz", ] -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "blake3" -version = "1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -1141,27 +1114,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "cast" version = "0.3.0" @@ -1304,7 +1256,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -1378,12 +1330,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "constant_time_eq" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" - [[package]] name = "convert_case" version = "0.6.0" @@ -1609,7 +1555,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -1620,7 +1566,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -1658,19 +1604,16 @@ dependencies = [ [[package]] name = "datafusion" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dae5f2abc725737d6e87b6d348a5aa2d0a77e4cf873045f004546da946e6e619" +checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9" dependencies = [ - "ahash", "arrow", "arrow-array", "arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", - "bzip2", "chrono", "dashmap 6.1.0", "datafusion-catalog", @@ -1681,6 +1624,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-table", "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", @@ -1688,36 +1632,27 @@ dependencies = [ "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", - "flate2", "futures", "glob", - "half", - "hashbrown 0.14.5", - "indexmap", "itertools 0.13.0", "log", - "num_cpus", "object_store 0.11.1", "parking_lot", "parquet", - "paste", - "pin-project-lite", "rand", + "regex", "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", - "xz2", - "zstd", ] [[package]] name = "datafusion-catalog" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998761705551f11ffa4ee692cc285b44eb1def6e0d28c4eaf5041b9e2810dc1e" +checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e" dependencies = [ "arrow-schema", "async-trait", @@ -1730,51 +1665,55 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11986f191e88d950f10a5cc512a598afba27d92e04a0201215ad60785005115a" +checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", "arrow-schema", - "chrono", "half", "hashbrown 0.14.5", - "instant", + "indexmap", "libc", - "num_cpus", + "log", "object_store 0.11.1", "parquet", "paste", "sqlparser", "tokio", + "web-time", ] [[package]] name = "datafusion-common-runtime" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "694c9d7ea1b82f95768215c4cb5c2d5c613690624e832a7ee64be563139d582f" +checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-doc" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3" + [[package]] name = "datafusion-execution" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b4cedcd98151e0a297f34021b6b232ff0ebc0f2f18ea5e7446b5ebda99b1a1" +checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486" dependencies = [ "arrow", - "chrono", "dashmap 6.1.0", "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.5", "log", "object_store 0.11.1", "parking_lot", @@ -1785,104 +1724,101 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8dd114dc0296cacaee98ad3165724529fcca9a65b2875abcd447b9cc02b2b74" +checksum = "62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c" dependencies = [ - "ahash", "arrow", - "arrow-array", - "arrow-buffer", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-expr-common", "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", "datafusion-physical-expr-common", + "indexmap", "paste", "serde_json", "sqlparser", - "strum", - "strum_macros", ] [[package]] name = "datafusion-expr-common" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d1ba2bb018218d9260bbd7de6a46a20f61b93d4911dba8aa07735625004c4fb" +checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55" dependencies = [ "arrow", "datafusion-common", - "paste", + "itertools 0.13.0", ] [[package]] name = "datafusion-functions" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "547cb780a4ac51fd8e52c0fb9188bc16cea4e35aebf6c454bda0b82a7a417304" +checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da" dependencies = [ "arrow", "arrow-buffer", "base64 0.22.1", - "blake2", - "blake3", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", "hashbrown 0.14.5", "hex", "itertools 0.13.0", "log", - "md-5", "rand", "regex", - "sha2", "unicode-segmentation", "uuid", ] [[package]] name = "datafusion-functions-aggregate" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e68cf5aa7ebcac08bd04bb709a9a6d4963eafd227da62b628133bc509c40f5a0" +checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" dependencies = [ "ahash", "arrow", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "half", "log", "paste", - "sqlparser", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2285d080dfecdfb8605b0ab2f1a41e2473208dc8e9bd6f5d1dbcfe97f517e6f" +checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand", ] [[package]] name = "datafusion-functions-nested" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b6ffbbb7cf7bf0c0e05eb6207023fef341cac83a593a5365a6fc83803c572a9" +checksum = "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5" dependencies = [ "arrow", "arrow-array", @@ -1898,106 +1834,139 @@ dependencies = [ "itertools 0.13.0", "log", "paste", - "rand", +] + +[[package]] +name = "datafusion-functions-table" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", ] [[package]] name = "datafusion-functions-window" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e78d30ebd6e9f74d4aeddec32744f5a18b5f9584591bc586fb5259c4848bac5" +checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8" dependencies = [ "datafusion-common", + "datafusion-doc", "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", "datafusion-physical-expr-common", "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" +dependencies = [ + "quote", + "syn 2.0.96", ] [[package]] name = "datafusion-optimizer" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be172c44bf344df707e0c041fa3f41e6dc5fb0976f539c68bc442bca150ee58c" +checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75" dependencies = [ "arrow", - "async-trait", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.5", "indexmap", "itertools 0.13.0", "log", - "paste", + "regex", "regex-syntax 0.8.5", ] [[package]] name = "datafusion-physical-expr" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b86b7fa0b8161c49b0f005b0df193fc6d9b65ceec675f155422cda5d1583ca" +checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", - "arrow-ord", "arrow-schema", - "arrow-string", - "base64 0.22.1", - "chrono", "datafusion-common", - "datafusion-execution", "datafusion-expr", "datafusion-expr-common", "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "hex", "indexmap", "itertools 0.13.0", "log", "paste", "petgraph", - "regex", ] [[package]] name = "datafusion-physical-expr-common" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "242ba8a26351d9ca16295814c46743b0d1b00ec372174bdfbba991d0953dd596" +checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "rand", + "itertools 0.13.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25ca088eb904bf1cfc9c5e5653110c70a6eaba43164085a9d180b35b77ce3b8b" +checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351" dependencies = [ - "arrow-schema", + "arrow", "datafusion-common", "datafusion-execution", + "datafusion-expr-common", "datafusion-physical-expr", "datafusion-physical-plan", "itertools 0.13.0", + "log", ] [[package]] name = "datafusion-physical-plan" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4989a53b824abc759685eb643f4d604c2fc2fea4e2c309ac3473bea263ecbbeb" +checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" dependencies = [ "ahash", "arrow", @@ -2011,8 +1980,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate", - "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", @@ -2021,45 +1989,45 @@ dependencies = [ "indexmap", "itertools 0.13.0", "log", - "once_cell", "parking_lot", "pin-project-lite", - "rand", "tokio", ] [[package]] name = "datafusion-sql" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66b9b75b9da10ed656073ac0553708f17eb8fa5a7b065ef9848914c93150ab9e" +checksum = "6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc" dependencies = [ "arrow", "arrow-array", "arrow-schema", + "bigdecimal", "datafusion-common", "datafusion-expr", + "indexmap", "log", "regex", "sqlparser", - "strum", ] [[package]] name = "datafusion-substrait" -version = "42.2.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "220d7ab0ffadd8b1af753904b18dd92d270271810b1ce9f8be3c3dbe2392b636" +checksum = "d2ec36dd38512b1ecc7a3bb92e72046b944611b2f0d709445c1e51b0143bffd4" dependencies = [ "arrow-buffer", "async-recursion", + "async-trait", "chrono", "datafusion", "itertools 0.13.0", "object_store 0.11.1", "pbjson-types", - "prost 0.13.3", - "substrait 0.41.9", + "prost 0.13.4", + "substrait 0.50.4", "url", ] @@ -2120,7 +2088,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2130,7 +2098,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2179,7 +2147,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2345,7 +2313,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2603,7 +2571,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3158,7 +3126,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3208,7 +3176,7 @@ dependencies = [ "libflate", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3470,8 +3438,8 @@ dependencies = [ "pprof", "pretty_assertions", "prost 0.12.6", - "prost 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-types 0.13.4", "rand", "random_word", "roaring", @@ -3533,7 +3501,7 @@ dependencies = [ "object_store 0.10.2", "pin-project", "proptest", - "prost 0.13.3", + "prost 0.13.4", "rand", "roaring", "serde_json", @@ -3568,7 +3536,7 @@ dependencies = [ "lance-datagen", "lazy_static", "log", - "prost 0.13.3", + "prost 0.13.4", "snafu 0.7.5", "substrait-expr", "tokio", @@ -3622,9 +3590,9 @@ dependencies = [ "num-traits", "paste", "pprof", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", "protobuf-src", "rand", "rand_xoshiro", @@ -3661,9 +3629,9 @@ dependencies = [ "lance-io", "log", "pprof", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", "protobuf-src", "rand", "snafu 0.7.5", @@ -3701,9 +3669,9 @@ dependencies = [ "pprof", "pretty_assertions", "proptest", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", "protobuf-src", "rand", "roaring", @@ -3760,8 +3728,8 @@ dependencies = [ "num-traits", "object_store 0.10.2", "pprof", - "prost 0.13.3", - "prost-build 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", "protobuf-src", "rand", "random_word", @@ -3811,7 +3779,7 @@ dependencies = [ "path_abs", "pin-project", "pprof", - "prost 0.13.3", + "prost 0.13.4", "rand", "shellexpand", "snafu 0.7.5", @@ -3902,9 +3870,9 @@ dependencies = [ "pprof", "pretty_assertions", "proptest", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", "protobuf-src", "rand", "rangemap", @@ -3924,7 +3892,7 @@ version = "0.22.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4316,7 +4284,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4614,7 +4582,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4781,8 +4749,8 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck 0.5.0", "itertools 0.13.0", - "prost 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-types 0.13.4", ] [[package]] @@ -4795,8 +4763,8 @@ dependencies = [ "chrono", "pbjson", "pbjson-build", - "prost 0.13.3", - "prost-build 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", "serde", ] @@ -4877,7 +4845,7 @@ checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5033,7 +5001,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5086,12 +5054,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", - "prost-derive 0.13.3", + "prost-derive 0.13.4", ] [[package]] @@ -5111,17 +5079,16 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.90", + "syn 2.0.96", "tempfile", ] [[package]] name = "prost-build" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" +checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" dependencies = [ - "bytes", "heck 0.5.0", "itertools 0.13.0", "log", @@ -5129,10 +5096,10 @@ dependencies = [ "once_cell", "petgraph", "prettyplease", - "prost 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-types 0.13.4", "regex", - "syn 2.0.90", + "syn 2.0.96", "tempfile", ] @@ -5146,20 +5113,20 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5173,11 +5140,11 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" dependencies = [ - "prost 0.13.3", + "prost 0.13.4", ] [[package]] @@ -5473,16 +5440,6 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" -[[package]] -name = "regress" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" -dependencies = [ - "hashbrown 0.14.5", - "memchr", -] - [[package]] name = "regress" version = "0.10.1" @@ -5615,7 +5572,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.90", + "syn 2.0.96", "unicode-ident", ] @@ -5831,7 +5788,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5888,9 +5845,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" dependencies = [ "serde", ] @@ -5903,22 +5860,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5929,14 +5886,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" dependencies = [ "itoa", "memchr", @@ -5953,7 +5910,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6095,7 +6052,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6122,9 +6079,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.50.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2e5b515a2bd5168426033e9efbfd05500114833916f1d5c268f938b4ee130ac" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" dependencies = [ "log", "sqlparser_derive", @@ -6132,13 +6089,13 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6196,64 +6153,65 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] name = "substrait" -version = "0.41.9" +version = "0.49.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a3bf05f1d7a3fd7a97790d410f6e859b3a98dcde05e7a3fc00b31b0f60fe7cb" +checksum = "2c271a596176d3b82bfc5b4107fe9fbd30e6a9a99c0dca146777f05d8f0e08e4" dependencies = [ "heck 0.5.0", - "pbjson", - "pbjson-build", - "pbjson-types", "prettyplease", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", + "regress", "schemars", "semver", "serde", "serde_json", "serde_yaml", - "syn 2.0.90", - "typify 0.1.0", + "syn 2.0.96", + "typify", "walkdir", ] [[package]] name = "substrait" -version = "0.49.5" +version = "0.50.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c271a596176d3b82bfc5b4107fe9fbd30e6a9a99c0dca146777f05d8f0e08e4" +checksum = "b1772d041c37cc7e6477733c76b2acf4ee36bd52b2ae4d9ea0ec9c87d003db32" dependencies = [ "heck 0.5.0", + "pbjson", + "pbjson-build", + "pbjson-types", "prettyplease", - "prost 0.13.3", - "prost-build 0.13.3", - "prost-types 0.13.3", - "regress 0.10.1", + "prost 0.13.4", + "prost-build 0.13.4", + "prost-types 0.13.4", + "regress", "schemars", "semver", "serde", "serde_json", "serde_yaml", - "syn 2.0.90", - "typify 0.2.0", + "syn 2.0.96", + "typify", "walkdir", ] [[package]] name = "substrait-expr" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a6a94f5dd69c5329a9c96c93ac5f17a8d64089ca21d29d7971825f7451941d" +checksum = "9d091cf06bc7808bd81eb01f5f5b77b2b14288bb022501a2dcad78633c65262f" dependencies = [ "once_cell", - "prost 0.13.3", - "substrait 0.49.5", + "prost 0.13.4", + "substrait 0.50.4", "substrait-expr-funcgen", "substrait-expr-macros", "thiserror 2.0.4", @@ -6271,7 +6229,7 @@ dependencies = [ "quote", "serde_yaml", "substrait 0.49.5", - "syn 2.0.90", + "syn 2.0.96", "thiserror 2.0.4", ] @@ -6283,7 +6241,7 @@ checksum = "3a2be2af0276c9d693f90d0f4e0e7b1790b14692538e0d418812249f41c055be" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6328,9 +6286,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -6354,7 +6312,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6589,7 +6547,7 @@ checksum = "5999e24eaa32083191ba4e425deb75cdf25efefabe5aaccb7446dd0d4122a3f5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6647,7 +6605,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6658,7 +6616,7 @@ checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6782,7 +6740,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6882,7 +6840,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6963,44 +6921,14 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" -[[package]] -name = "typify" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb6beec125971dda80a086f90b4a70f60f222990ce4d63ad0fc140492f53444" -dependencies = [ - "typify-impl 0.1.0", - "typify-macro 0.1.0", -] - [[package]] name = "typify" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c644dda9862f0fef3a570d8ddb3c2cfb1d5ac824a1f2ddfa7bc8f071a5ad8a" dependencies = [ - "typify-impl 0.2.0", - "typify-macro 0.2.0", -] - -[[package]] -name = "typify-impl" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93bbb24e990654aff858d80fee8114f4322f7d7a1b1ecb45129e2fcb0d0ad5ae" -dependencies = [ - "heck 0.5.0", - "log", - "proc-macro2", - "quote", - "regress 0.9.1", - "schemars", - "semver", - "serde", - "serde_json", - "syn 2.0.90", - "thiserror 1.0.69", - "unicode-ident", + "typify-impl", + "typify-macro", ] [[package]] @@ -7013,33 +6941,16 @@ dependencies = [ "log", "proc-macro2", "quote", - "regress 0.10.1", + "regress", "schemars", "semver", "serde", "serde_json", - "syn 2.0.90", + "syn 2.0.96", "thiserror 1.0.69", "unicode-ident", ] -[[package]] -name = "typify-macro" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8e6491896e955692d68361c68db2b263e3bec317ec0b684e0e2fa882fb6e31e" -dependencies = [ - "proc-macro2", - "quote", - "schemars", - "semver", - "serde", - "serde_json", - "serde_tokenstream", - "syn 2.0.90", - "typify-impl 0.1.0", -] - [[package]] name = "typify-macro" version = "0.2.0" @@ -7053,8 +6964,8 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.90", - "typify-impl 0.2.0", + "syn 2.0.96", + "typify-impl", ] [[package]] @@ -7267,7 +7178,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "wasm-bindgen-shared", ] @@ -7302,7 +7213,7 @@ checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7686,15 +7597,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yada" version = "0.5.1" @@ -7727,7 +7629,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "synstructure", ] @@ -7749,7 +7651,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -7769,7 +7671,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "synstructure", ] @@ -7798,7 +7700,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index f9cd332f23..ac6b633c88 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -210,7 +210,7 @@ pub fn new_session_context(options: LanceExecutionOptions) -> SessionContext { options.mem_pool_size() as usize ))); } - let runtime_env = Arc::new(runtime_env_builder.build().unwrap()); + let runtime_env = runtime_env_builder.build_arc().unwrap(); SessionContext::new_with_config_rt(session_config, runtime_env) } diff --git a/rust/lance-datafusion/src/planner.rs b/rust/lance-datafusion/src/planner.rs index e29f3aff91..d9d9b44e0d 100644 --- a/rust/lance-datafusion/src/planner.rs +++ b/rust/lance-datafusion/src/planner.rs @@ -162,7 +162,7 @@ struct LanceContextProvider { impl Default for LanceContextProvider { fn default() -> Self { let config = SessionConfig::new(); - let runtime = Arc::new(RuntimeEnvBuilder::new().build().unwrap()); + let runtime = RuntimeEnvBuilder::new().build_arc().unwrap(); let mut state_builder = SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 529138a03d..265c9a2220 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -2155,26 +2155,28 @@ impl Scanner { expressions::col(ROW_ID, schema.as_ref())?, ROW_ID.to_string(), )]; - // for now multivector is always with cosine distance so here convert the distance to `1 - distance`, + // for now multivector is always with cosine distance so here convert the distance to `1 - distance` + // and calculate the sum across all rows with the same row id. + let sum_expr = AggregateExprBuilder::new( + functions_aggregate::sum::sum_udaf(), + vec![expressions::binary( + expressions::lit(1.0), + datafusion_expr::Operator::Minus, + expressions::cast( + expressions::col(DIST_COL, &schema)?, + &schema, + DataType::Float64, + )?, + &schema, + )?], + ) + .schema(schema.clone()) + .alias(DIST_COL) + .build()?; let ann_node: Arc = Arc::new(AggregateExec::try_new( AggregateMode::Single, PhysicalGroupBy::new_single(group_expr), - vec![AggregateExprBuilder::new( - functions_aggregate::sum::sum_udaf(), - vec![expressions::binary( - expressions::lit(1.0), - datafusion_expr::Operator::Minus, - expressions::cast( - expressions::col(DIST_COL, &schema)?, - &schema, - DataType::Float64, - )?, - &schema, - )?], - ) - .schema(schema.clone()) - .alias(DIST_COL) - .build()?], + vec![Arc::new(sum_expr)], vec![None], ann_node, schema, @@ -2188,7 +2190,7 @@ impl Scanner { }, }; let ann_node = Arc::new( - SortExec::new(vec![sort_expr], ann_node) + SortExec::new(LexOrdering::new(vec![sort_expr]), ann_node) .with_fetch(Some(q.k * q.refine_factor.unwrap_or(1) as usize)), ); diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index 1d603dec40..5faf57a34d 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -31,14 +31,16 @@ use datafusion::{ context::{SessionConfig, SessionContext}, memory_pool::MemoryConsumer, }, - logical_expr::{Expr, JoinType}, + logical_expr::{self, Expr, JoinType}, physical_plan::{ joins::{HashJoinExec, PartitionMode}, + projection::ProjectionExec, repartition::RepartitionExec, stream::RecordBatchStreamAdapter, union::UnionExec, ColumnarValue, ExecutionPlan, PhysicalExpr, SendableRecordBatchStream, }, + prelude::DataFrame, scalar::ScalarValue, }; @@ -512,9 +514,16 @@ impl MergeInsertJob { )?); } + // We need to prefix the fields in the target with target_ so that we don't have any duplicate + // field names (DF doesn't support this as of version 44) + target = Self::prefix_columns_phys(target, "target_"); + // 6 - Finally, join the input (source table) with the taken data (target table) let source_key = Column::new_with_schema(&index_column, shared_input.schema().as_ref())?; - let target_key = Column::new_with_schema(&index_column, target.schema().as_ref())?; + let target_key = Column::new_with_schema( + &format!("target_{}", index_column), + target.schema().as_ref(), + )?; let joined = Arc::new( HashJoinExec::try_new( shared_input, @@ -537,6 +546,31 @@ impl MergeInsertJob { ) } + fn prefix_columns(df: DataFrame, prefix: &str) -> DataFrame { + let schema = df.schema(); + let columns = schema + .fields() + .iter() + .map(|f| logical_expr::col(f.name()).alias(format!("{}{}", prefix, f.name()))) + .collect::>(); + df.select(columns).unwrap() + } + + fn prefix_columns_phys(inp: Arc, prefix: &str) -> Arc { + let schema = inp.schema(); + let exprs = schema + .fields() + .iter() + .enumerate() + .map(|(idx, f)| { + let col = Arc::new(Column::new(f.name(), idx)) as Arc; + let new_name = format!("{}{}", prefix, f.name()); + (col, new_name) + }) + .collect::>(); + Arc::new(ProjectionExec::try_new(exprs, inp).unwrap()) + } + // If the join keys are not indexed then we need to do a full scan of the table async fn create_full_table_joined_stream( &self, @@ -552,12 +586,21 @@ impl MergeInsertJob { .iter() .map(|c| c.as_str()) .collect::>(); // vector of strings of col names to join + let target_cols = self + .params + .on + .iter() + .map(|c| format!("target_{}", c)) + .collect::>(); + let target_cols = target_cols.iter().map(|s| s.as_str()).collect::>(); match self.check_compatible_schema(&schema)? { SchemaComparison::FullCompatible => { let existing = session_ctx.read_lance(self.dataset.clone(), true, false)?; + // We need to rename the columns from the target table so that they don't conflict with the source table + let existing = Self::prefix_columns(existing, "target_"); let joined = - new_data.join(existing, JoinType::Full, &join_cols, &join_cols, None)?; // full join + new_data.join(existing, JoinType::Full, &join_cols, &target_cols, None)?; // full join Ok(joined.execute_stream().await?) } SchemaComparison::Subschema => { @@ -569,18 +612,27 @@ impl MergeInsertJob { .chain([ROW_ID, ROW_ADDR]) .collect::>(); let projected = existing.select_columns(&columns)?; + // We need to rename the columns from the target table so that they don't conflict with the source table + let projected = Self::prefix_columns(projected, "target_"); // We aren't supporting inserts or deletes right now, so we can use inner join let join_type = if self.params.insert_not_matched { JoinType::Left } else { JoinType::Inner }; - let joined = new_data.join(projected, join_type, &join_cols, &join_cols, None)?; + let joined = new_data.join(projected, join_type, &join_cols, &target_cols, None)?; Ok(joined.execute_stream().await?) } } } + /// Join the source and target data streams + /// + /// If there is a scalar index on the join key, we can use it to do an indexed join. Otherwise we need to do + /// a full outer join. + /// + /// Datafusion doesn't allow duplicate column names so during this join we rename the columns from target and + /// prefix them with _target. async fn create_joined_stream( &self, source: SendableRecordBatchStream,