diff --git a/Cargo.lock b/Cargo.lock index 9b5ffb95a5..8e113ef2c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -958,6 +964,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -1164,6 +1179,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1386,6 +1410,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpp_demangle" version = "0.4.4" @@ -1546,6 +1579,47 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" 
+version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.90", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -2019,6 +2093,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.90", +] + [[package]] name = "diff" version = "0.1.13" @@ -2098,6 +2203,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_filter" version = "0.1.2" @@ -2280,6 +2467,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2431,6 +2633,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2756,6 +2967,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.5.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.10" @@ -2925,6 +3152,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -2946,6 +3179,29 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "indexmap" version = "2.7.0" @@ -3063,6 +3319,30 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jni" version = "0.21.1" @@ -3104,6 +3384,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = 
"1.0.7" @@ -3437,9 +3726,11 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -3451,6 +3742,8 @@ dependencies = [ "lance-table", "lance-testing", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3735,6 +4028,30 @@ version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.11" @@ -3752,6 +4069,67 @@ dependencies = [ "redox_syscall", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-dictionary", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + 
"derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.4", + "yada", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3965,6 +4343,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.26.4" @@ -4184,12 +4579,50 @@ version = "11.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -5053,6 +5486,7 @@ checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.7", @@ -5061,11 +5495,13 @@ dependencies = [ "http-body-util", "hyper 1.5.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -5078,7 +5514,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -5114,6 +5552,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.7" @@ -5892,6 +6336,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + 
"core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -6299,6 +6764,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -6573,12 +7048,27 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -6682,6 +7172,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -7178,6 +7674,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index 2b35b080f3..36dd006343 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,12 +110,14 @@ datafusion-physical-expr = { version = "42.0", features = [ "regex_expressions", ] } deepsize = "0.2.0" +dirs = "5.0.0" either = "1.0" fsst = { version = "=0.21.1", path = "./rust/lance-encoding/src/compression_algo/fsst" } futures = "0.3" http = "1.1.0" hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } itertools = "0.13" +jieba-rs = { version = "0.7", default-features = false } lazy_static = "1" log = "0.4" mockall = { version = "0.13.1" } @@ -143,6 +145,8 @@ serde_json = { version = "1" } shellexpand = "3.0" snafu = "0.7.5" tantivy = { version = "0.22.0", features = ["stopwords"] } +lindera = { version = "0.38.1"} +lindera-tantivy = { version = "0.38.1"} tempfile = "3" test-log = { version = "0.2.15" } tokio = { version = "1.23", features = [ diff --git a/docs/tokenizer.rst b/docs/tokenizer.rst new file mode 100644 index 0000000000..306b7919ad --- /dev/null +++ b/docs/tokenizer.rst @@ -0,0 +1,87 @@ +Tokenizers +============================ + +Currently, Lance has built-in support for Jieba and Lindera. However, it doesn't come with its own language models. +If tokenization is needed, you can download language models by yourself. +You can specify the location where the language models are stored by setting the environment variable LANCE_LANGUAGE_MODEL_HOME. +If it's not set, the default value is + +... code-block::bash + ${system data directory}/lance/language_models + +It also supports configuring user dictionaries, +which makes it convenient for users to expand their own dictionaries without retraining the language models. + +Language Models of Jieba +--------------- + +Downloading the Model +~~~~~~~~~~~ + +... 
code-block:: bash + python -m lance.download jieba + +The language model is stored by default in ``${LANCE_LANGUAGE_MODEL_HOME}/jieba/default``. + +Using the Model +~~~~~~~~~~~~~~~ + +.. code-block:: python + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default") + +User Dictionaries +~~~~~~~~~~~~~~~~~ +Create a file named config.json in the root directory of the current model. + +.. code-block:: json + { + "main": "dict.txt", + "users": ["path/to/user/dict.txt"] + } + +- The "main" field is optional. If omitted, the default is "dict.txt". +- "users" is a list of paths to user dictionaries. For the format of a user dictionary, please refer to https://github.com/messense/jieba-rs/blob/main/src/data/dict.txt. + + +Language Models of Lindera +-------------------------- + +Downloading the Model +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + python -m lance.download lindera -l [ipadic|ko-dic|unidic] + +Note that the language models of Lindera need to be compiled. Please install lindera-cli first. For detailed steps, please refer to https://github.com/lindera/lindera/tree/main/lindera-cli. + +The language model is stored by default in ``${LANCE_LANGUAGE_MODEL_HOME}/lindera/[ipadic|ko-dic|unidic]``. + +Using the Model +~~~~~~~~~~~~~~~ + +.. code-block:: python + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic") + +User Dictionaries +~~~~~~~~~~~~~~~~~ + +Create a file named config.json in the root directory of the current model. + +.. code-block:: json + { + "main": "main", + "users": "path/to/user/dict.bin", + "user_kind": "ipadic|ko-dic|unidic" + } + +- The "main" field is optional. If omitted, the default is the "main" directory. +- "users" is the path of the user dictionary. The user dictionary can be passed as a CSV file or as a binary file compiled by lindera-cli. +- The "user_kind" field can be omitted if the user dictionary is in binary format. If it is in CSV format, you need to specify the type of the language model. 
+ + +Create Your Own Language Model +------------------------------ + +Place your language model under the directory that ``LANCE_LANGUAGE_MODEL_HOME`` points to. + + diff --git a/python/Cargo.lock b/python/Cargo.lock index a15f68509e..201fa3c0e9 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -17,6 +17,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -880,6 +886,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1044,6 +1059,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1170,6 +1194,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.16" @@ -1283,6 +1316,47 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.90", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -1747,6 +1821,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.90", +] + [[package]] name = "digest" version = "0.10.7" @@ -1814,6 +1919,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + 
+[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1943,6 +2130,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2077,6 +2279,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2411,6 +2622,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.5.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = 
"hyper-util" version = "0.1.10" @@ -2580,6 +2807,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -2601,6 +2834,29 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "indexmap" version = "2.7.0" @@ -2700,6 +2956,30 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -2719,6 +2999,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -2976,9 +3265,11 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -2988,6 +3279,8 @@ dependencies = [ "lance-linalg", "lance-table", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3190,6 +3483,30 @@ version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.11" @@ -3207,6 +3524,67 @@ dependencies = [ "redox_syscall", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-dictionary", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.4", + "yada", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3394,6 +3772,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "noisy_float" version = "0.2.0" @@ -3586,12 +3981,50 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e296cf87e61c9cfc1a61c3c63a0f7f286ed4554e0e22be84e8a38e1d264a2a29" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4446,6 +4879,7 @@ checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.7", @@ -4454,11 +4888,13 @@ dependencies = [ "http-body-util", "hyper 1.5.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -4471,7 +4907,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -4498,6 +4936,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.7" @@ -5042,6 +5486,12 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -5136,6 +5586,27 @@ dependencies = [ 
"syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5511,6 +5982,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -5705,12 +6186,27 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -5814,6 +6310,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2" +[[package]] +name = "vcpkg" +version = 
"0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6238,6 +6740,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yoke" version = "0.7.5" diff --git a/python/Cargo.toml b/python/Cargo.toml index 5c5d281e1c..fb3dafcc5a 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index" } +lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera", "tokenizer-jieba"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/python/python/lance/download.py b/python/python/lance/download.py new file mode 100644 index 0000000000..cff42520e4 --- /dev/null +++ b/python/python/lance/download.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +import os +import shutil +import subprocess +import tarfile +import traceback +from io import BytesIO + +from .lance import language_model_home + +LANGUAGE_MODEL_HOME = language_model_home() + + +def check_lindera(): + if not shutil.which("lindera"): + raise Exception( + "lindera is not installed. 
Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli" + ) + + +def import_requests(): + try: + import requests + except Exception: + raise Exception("requests is not installed, Please pip install requests") + return requests + + +def download_jieba(): + dirname = os.path.join(LANGUAGE_MODEL_HOME, "jieba", "default") + os.makedirs(dirname, exist_ok=True) + try: + requests = import_requests() + resp = requests.get( + "https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt" + ) + content = resp.content + with open(os.path.join(dirname, "dict.txt"), "wb") as out: + out.write(content) + except Exception as _: + traceback.print_exc() + print( + "Download jieba language model failed. Please download dict.txt from " + "https://github.com/messense/jieba-rs/tree/main/src/data " + f"and put it in {dirname}" + ) + + +def download_lindera(lm: str): + requests = import_requests() + dirname = os.path.join(LANGUAGE_MODEL_HOME, "lindera", lm) + src_dirname = os.path.join(dirname, "src") + if lm == "ipadic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ipadic-2.7.0-20070801.tar.gz" + elif lm == "ko-dic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ko-dic-2.1.1-20180720.tar.gz" + elif lm == "unidic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/unidic-mecab-2.1.2.tar.gz" + else: + raise Exception(f"language model {lm} is not supported") + os.makedirs(src_dirname, exist_ok=True) + print(f"downloading language model: {url}") + data = requests.get(url).content + print(f"unzip language model: {url}") + + cwd = os.getcwd() + try: + os.chdir(src_dirname) + with tarfile.open(fileobj=BytesIO(data)) as tar: + tar.extractall() + name = tar.getnames()[0] + cmd = [ + "lindera", + "build", + f"--dictionary-kind={lm}", + os.path.join(src_dirname, name), + os.path.join(dirname, "main"), + ] + print(f"compiling language model: {' '.join(cmd)}") + subprocess.run(cmd) + finally: + os.chdir(cwd) + + +def main(): + import argparse + 
+ parser = argparse.ArgumentParser( + description="Lance tokenizer language model downloader" + ) + parser.add_argument("tokenizer", choices=["jieba", "lindera"]) + parser.add_argument("-l", "--languagemodel") + args = parser.parse_args() + print(f"LANCE_LANGUAGE_MODEL_HOME={LANGUAGE_MODEL_HOME}") + if args.tokenizer == "jieba": + download_jieba() + elif args.tokenizer == "lindera": + download_lindera(args.languagemodel) + + +if __name__ == "__main__": + main() diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index b9ab1a2d2d..8a4638b909 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -15,6 +15,7 @@ from pathlib import Path from typing import ( Any, + Callable, Dict, Iterable, Iterator, @@ -435,3 +436,4 @@ class BFloat16: def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... __version__: str +language_model_home: Callable[[], str] diff --git a/python/python/tests/models/jieba/default/dict.txt b/python/python/tests/models/jieba/default/dict.txt new file mode 100644 index 0000000000..237b47ca6a --- /dev/null +++ b/python/python/tests/models/jieba/default/dict.txt @@ -0,0 +1,8 @@ +我们 98740 r +都 202780 d +有 423765 v +光明 1219 n +的 318825 uj +前途 1263 n +前 62779 f +途 857 n diff --git a/python/python/tests/models/jieba/invalid_dict/config.json b/python/python/tests/models/jieba/invalid_dict/config.json new file mode 100644 index 0000000000..cf4301aa2b --- /dev/null +++ b/python/python/tests/models/jieba/invalid_dict/config.json @@ -0,0 +1,6 @@ +{ + "main": "../default/dict.txt", + "users": [ + "invalid_user.txt" + ] +} diff --git a/python/python/tests/models/jieba/invalid_dict2/config.json b/python/python/tests/models/jieba/invalid_dict2/config.json new file mode 100644 index 0000000000..d0216419a5 --- /dev/null +++ b/python/python/tests/models/jieba/invalid_dict2/config.json @@ -0,0 +1,3 @@ +{ + "main": "invalid_dict.txt" +} diff --git 
a/python/python/tests/models/jieba/user_dict/config.json b/python/python/tests/models/jieba/user_dict/config.json new file mode 100644 index 0000000000..0d65334ca2 --- /dev/null +++ b/python/python/tests/models/jieba/user_dict/config.json @@ -0,0 +1,6 @@ +{ + "main": "../default/dict.txt", + "users": [ + "user.txt" + ] +} diff --git a/python/python/tests/models/jieba/user_dict/user.txt b/python/python/tests/models/jieba/user_dict/user.txt new file mode 100644 index 0000000000..bb6ffa4d85 --- /dev/null +++ b/python/python/tests/models/jieba/user_dict/user.txt @@ -0,0 +1 @@ +光明的前途 318825 n diff --git a/python/python/tests/models/lindera/README.md b/python/python/tests/models/lindera/README.md new file mode 100644 index 0000000000..c4073b65d5 --- /dev/null +++ b/python/python/tests/models/lindera/README.md @@ -0,0 +1,28 @@ +# How to build this test language model + +Ipadic model is about 45M. so we created a tiny ipadic in zip. + +- Download language model + +```bash +curl -L -o mecab-ipadic-2.7.0-20070801.tar.gz "https://github.com/lindera-morphology/mecab-ipadic/archive/refs/tags/2.7.0-20070801.tar.gz" +tar xvf mecab-ipadic-2.7.0-20070801.tar.gz +``` + +- Remove csv files in folder + +- Put files in `ipadic/raw` into folder + +- Edit matrix.def, reset last column(weight) into zero, except first row. 
+ +- build + +```bash +lindera build --dictionary-kind=ipadic mecab-ipadic-2.7.0-20070801 main +``` + +- build user dict + +```bash +lindera build --build-user-dictionary --dictionary-kind=ipadic user_dict/userdict.csv user_dict2 +``` diff --git a/python/python/tests/models/lindera/invalid_dict/config.json b/python/python/tests/models/lindera/invalid_dict/config.json new file mode 100644 index 0000000000..b486aeba24 --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "invalid.bin" +} diff --git a/python/python/tests/models/lindera/invalid_dict2/config.json b/python/python/tests/models/lindera/invalid_dict2/config.json new file mode 100644 index 0000000000..11c22e9f1c --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "ipadic_simple_userdic.csv" +} diff --git a/python/python/tests/models/lindera/ipadic/main.zip b/python/python/tests/models/lindera/ipadic/main.zip new file mode 100644 index 0000000000..25966ae2a1 Binary files /dev/null and b/python/python/tests/models/lindera/ipadic/main.zip differ diff --git a/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv new file mode 100644 index 0000000000..4201b57a54 --- /dev/null +++ b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv @@ -0,0 +1,3 @@ +����,1293,1293,5686,̾��,��ͭ̾��,�ϰ�,����,*,*,����,�ʥ꥿,�ʥ꥿ +���,1285,1285,553,̾��,����,*,*,*,*,���,��������,�������� +����,1285,1285,7778,̾��,����,*,*,*,*,����,��������,�������� \ No newline at end of file diff --git a/python/python/tests/models/lindera/user_dict/config.json b/python/python/tests/models/lindera/user_dict/config.json new file mode 100644 index 0000000000..e554849af2 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/config.json @@ -0,0 +1,5 @@ +{ + "main": "../ipadic/main", + "user": "userdic.csv", + "user_kind": 
"ipadic" +} diff --git a/python/python/tests/models/lindera/user_dict/userdic.csv b/python/python/tests/models/lindera/user_dict/userdic.csv new file mode 100644 index 0000000000..652c3f7791 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/userdic.csv @@ -0,0 +1 @@ +成田国際空港,カスタム名詞,トウキョウスカイツリー diff --git a/python/python/tests/models/lindera/user_dict2/config.json b/python/python/tests/models/lindera/user_dict2/config.json new file mode 100644 index 0000000000..e06bd8c71b --- /dev/null +++ b/python/python/tests/models/lindera/user_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../ipadic/main", + "user": "userdic.bin" +} diff --git a/python/python/tests/models/lindera/user_dict2/userdic.bin b/python/python/tests/models/lindera/user_dict2/userdic.bin new file mode 100644 index 0000000000..a0410fa079 Binary files /dev/null and b/python/python/tests/models/lindera/user_dict2/userdic.bin differ diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index e58069b4a4..1dadd3c202 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -3,7 +3,9 @@ import os import random +import shutil import string +import zipfile from datetime import date, datetime, timedelta from pathlib import Path @@ -34,6 +36,27 @@ def gen_str(n, split="", char_set=string.ascii_letters + string.digits): return tbl +def set_language_model_path(): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) + + +@pytest.fixture() +def lindera_ipadic(): + set_language_model_path() + model_path = os.path.join(os.path.dirname(__file__), "models", "lindera", "ipadic") + cwd = os.getcwd() + try: + os.chdir(model_path) + with zipfile.ZipFile("main.zip", "r") as zip_ref: + zip_ref.extractall() + os.chdir(cwd) + yield + finally: + shutil.rmtree(os.path.join(model_path, "main")) + + @pytest.fixture() def dataset(tmp_path): tbl = create_table() @@ -326,6 +349,170 @@ def 
test_fts_all_deleted(dataset): dataset.to_table(full_text_query=first_row_doc) +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer( + tmp_path, lindera_ipadic +): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic") + + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict" + ) + + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type( + tmp_path, lindera_ipadic +): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict2" + ) + + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict") + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 + results = ds.to_table( + full_text_query="成田国際空港", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + 
"text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict2") + + +def test_jieba_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": ["我们都有光明的前途", "光明的前途"], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default") + results = ds.to_table( + full_text_query="我们", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_jieba_invalid_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict") + + +def test_jieba_invalid_main_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict2") + + +def test_jieba_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": ["我们都有光明的前途", "光明的前途"], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/user_dict") + results = ds.to_table( + full_text_query="的前", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 + results = ds.to_table( + full_text_query="光明的前途", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [1, 0] + + def test_bitmap_index(tmp_path: Path): """Test create bitmap index""" tbl = pa.Table.from_arrays( diff --git a/python/src/lib.rs b/python/src/lib.rs index 88efa3b0e4..677e492dc9 100644 --- a/python/src/lib.rs +++ 
b/python/src/lib.rs @@ -143,6 +143,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(read_tfrecord))?; m.add_wrapped(wrap_pyfunction!(trace_to_chrome))?; m.add_wrapped(wrap_pyfunction!(manifest_needs_migration))?; + m.add_wrapped(wrap_pyfunction!(language_model_home))?; m.add_wrapped(wrap_pyfunction!(bytes_read_counter))?; m.add_wrapped(wrap_pyfunction!(iops_counter))?; // Debug functions @@ -151,6 +152,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(debug::format_fragment))?; m.add_wrapped(wrap_pyfunction!(debug::list_transactions))?; m.add("__version__", env!("CARGO_PKG_VERSION"))?; + register_datagen(py, m)?; register_indices(py, m)?; Ok(()) @@ -184,6 +186,21 @@ fn json_to_schema(json: &str) -> PyResult> { Ok(schema.into()) } +#[pyfunction] +pub fn language_model_home() -> PyResult { + let Some(p) = lance_index::scalar::inverted::language_model_home() else { + return Err(pyo3::exceptions::PyValueError::new_err( + "Failed to get language model home", + )); + }; + let Some(pstr) = p.to_str() else { + return Err(pyo3::exceptions::PyValueError::new_err( + "Failed to convert language model home to str", + )); + }; + Ok(String::from(pstr)) +} + /// Infer schema from tfrecord file /// /// Parameters diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 12d38e5678..e6cf51d2d7 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -26,9 +26,11 @@ datafusion-physical-expr.workspace = true datafusion-sql.workspace = true datafusion.workspace = true deepsize.workspace = true +dirs.workspace = true futures.workspace = true half.workspace = true itertools.workspace = true +jieba-rs = { workspace = true, optional = true } lance-arrow.workspace = true lance-core.workspace = true lance-datafusion.workspace = true @@ -50,6 +52,8 @@ serde_json.workspace = true serde.workspace = true snafu.workspace = true tantivy.workspace = true 
+lindera = { workspace = true, optional = true } +lindera-tantivy = { workspace = true, optional = true } tokio.workspace = true tracing.workspace = true tempfile.workspace = true @@ -68,6 +72,11 @@ test-log.workspace = true datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } +[features] +tokenizer-lindera = ["lindera", "lindera-tantivy", "tokenizer-common"] +tokenizer-jieba = ["jieba-rs", "tokenizer-common"] +tokenizer-common = [] + [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 440def7a5a..7d34710286 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -1,10 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::{env, path::PathBuf}; + use lance_core::{Error, Result}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; +#[cfg(feature = "tokenizer-lindera")] +mod lindera; + +#[cfg(feature = "tokenizer-jieba")] +mod jieba; + /// Tokenizer configs #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TokenizerConfig { @@ -12,6 +20,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization + /// - `lindera/*`: Lindera tokenizer + /// - `jieba/*`: Jieba tokenizer /// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -141,9 +151,70 @@ fn build_base_tokenizer_builder(name: &str) -> Result { + let Some(home) = language_model_home() else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )); + }; + lindera::LinderaBuilder::load(&home.join(s))?.build() + } + #[cfg(feature = "tokenizer-jieba")] + s if s.starts_with("jieba/") || s == "jieba" => { + let s = if s == "jieba" 
{ "jieba/default" } else { s }; + let Some(home) = language_model_home() else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )); + }; + jieba::JiebaBuilder::load(&home.join(s))?.build() + } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), )), } } + +pub const LANCE_LANGUAGE_MODEL_HOME_ENV_KEY: &str = "LANCE_LANGUAGE_MODEL_HOME"; + +pub const LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY: &str = "lance/language_models"; + +pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json"; + +pub fn language_model_home() -> Option { + match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { + Ok(p) => Some(PathBuf::from(p)), + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)), + } +} + +#[cfg(feature = "tokenizer-common")] +trait TokenizerBuilder: Sized { + type Config: serde::de::DeserializeOwned + Default; + fn load(p: &std::path::Path) -> Result { + if !p.is_dir() { + return Err(Error::io( + format!("{} is not a valid directory", p.display()), + location!(), + )); + } + use std::{fs::File, io::BufReader}; + let config_path = p.join(LANCE_LANGUAGE_MODEL_CONFIG_FILE); + let config = if config_path.exists() { + let file = File::open(config_path)?; + let reader = BufReader::new(file); + serde_json::from_reader::, Self::Config>(reader)? 
+ } else { + Self::Config::default() + }; + Self::new(config, p) + } + + fn new(config: Self::Config, root: &std::path::Path) -> Result; + + fn build(&self) -> Result; +} diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs new file mode 100644 index 0000000000..95445fb544 --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::path::{Path, PathBuf}; + +use super::TokenizerBuilder; +use lance_core::{Error, Result}; +use serde::{Deserialize, Serialize}; +use snafu::{location, Location}; + +#[derive(Serialize, Deserialize, Default)] +pub struct JiebaConfig { + main: Option, + users: Option>, +} + +pub struct JiebaBuilder { + root: PathBuf, + config: JiebaConfig, +} + +impl JiebaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("dict.txt") + } + + fn user_dict_paths(&self) -> Vec { + let Some(users) = &self.config.users else { + return vec![]; + }; + users.iter().map(|p| self.root.join(p)).collect() + } +} + +impl TokenizerBuilder for JiebaBuilder { + type Config = JiebaConfig; + + fn new(config: Self::Config, root: &Path) -> Result { + Ok(Self { + config, + root: root.to_path_buf(), + }) + } + + fn build(&self) -> Result { + let main_dict_path = &self.main_dict_path(); + let file = std::fs::File::open(main_dict_path)?; + let mut f = std::io::BufReader::new(file); + let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { + Error::io( + format!( + "load jieba tokenizer dictionary {}, error: {}", + main_dict_path.display(), + e + ), + location!(), + ) + })?; + for user_dict_path in &self.user_dict_paths() { + let file = std::fs::File::open(user_dict_path)?; + let mut f = std::io::BufReader::new(file); + jieba.load_dict(&mut f).map_err(|e| { + Error::io( + 
format!( + "load jieba tokenizer user dictionary {}, error: {}", + user_dict_path.display(), + e + ), + location!(), + ) + })? + } + let tokenizer = JiebaTokenizer { jieba }; + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +} + +#[derive(Clone)] +struct JiebaTokenizer { + jieba: jieba_rs::Jieba, +} + +struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl tantivy::tokenizer::TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.tokens[self.index - 1] + } +} + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = self + .jieba + .tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + for token in orig_tokens { + tokens.push(tantivy::tokenizer::Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs new file mode 100644 index 0000000000..23c8042dd0 --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::path::{Path, PathBuf}; + +use super::TokenizerBuilder; +use lance_core::{Error, 
Result}; +use lindera::{ + dictionary::{ + load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, + }, + mode::Mode, + segmenter::Segmenter, +}; +use lindera_tantivy::tokenizer::LinderaTokenizer; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use snafu::{location, Location}; + +#[derive(Serialize, Deserialize, Default)] +pub struct LinderaConfig { + main: Option, + user: Option, + user_kind: Option, +} + +pub struct LinderaBuilder { + root: PathBuf, + config: LinderaConfig, +} + +impl LinderaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("main") + } + + fn user_dict_config(&self) -> Result> { + let Some(user_dict_path) = &self.config.user else { + return Ok(None); + }; + let mut conf = Map::::new(); + let user_path = self.root.join(user_dict_path); + let Some(p) = user_path.to_str() else { + return Err(Error::io( + format!( + "invalid lindera tokenizer user dictionary path: {}", + user_path.display() + ), + location!(), + )); + }; + conf.insert(String::from("path"), Value::String(String::from(p))); + if let Some(kind) = &self.config.user_kind { + conf.insert(String::from("kind"), Value::String(kind.clone())); + } + Ok(Some(Value::Object(conf))) + } +} + +impl TokenizerBuilder for LinderaBuilder { + type Config = LinderaConfig; + + fn new(config: Self::Config, root: &Path) -> Result { + Ok(Self { + config, + root: root.to_path_buf(), + }) + } + + fn build(&self) -> Result { + let main_path = self.main_dict_path(); + let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { + Error::io( + format!( + "load lindera tokenizer main dictionary from {}, error: {}", + main_path.display(), + e + ), + location!(), + ) + })?; + let user_dictionary = match self.user_dict_config()? 
{ + Some(conf) => { + let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { + Error::io( + format!("load lindera tokenizer user dictionary, conf:{conf}, err: {e}"), + location!(), + ) + })?; + Some(user_dictionary) + } + None => None, + }; + let mode = Mode::Normal; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +}