Skip to content

Commit

Permalink
Merge #251
Browse files Browse the repository at this point in the history
251: Improve khmer segmenter performance by using fst segmenter r=ManyTheFish a=xshadowlegendx

# Pull Request

## Related issue
Fixes [#250](#250)

## What does this PR do?
- [x] Add a Khmer word FST, built from [khmerdict.txt](https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/khmerdict.txt) with the `fst` crate
- [x] Switch the Khmer segmenter from `icu_segmenter` to the FST segmenter
- [x] Remove the `icu` dependencies

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: xshadowlegendx <[email protected]>
  • Loading branch information
meili-bors[bot] and xshadowlegendx authored Nov 27, 2023
2 parents 6864604 + 1bc027e commit b31e01d
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 35 deletions.
5 changes: 1 addition & 4 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ unicode-normalization = "0.1.22"
irg-kvariants = "0.1.0"
litemap = "0.6.1"
zerovec = "0.9.3"
icu = { version = "1.3.0", features = ["serde"] , optional = true }
icu_provider_blob = { version = "1.3.0", optional = true }
icu_provider = { version = "1.3.0", features = ["sync"], optional = true }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
Expand Down Expand Up @@ -66,7 +63,7 @@ greek = []
# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

khmer = ["dep:icu", "dep:icu_provider_blob", "dep:icu_provider"]
khmer = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]
Expand Down
Binary file removed charabia/dictionaries/bin/icu4x-khmer-keys
Binary file not shown.
Binary file added charabia/dictionaries/fst/khmer/words.fst
Binary file not shown.
39 changes: 8 additions & 31 deletions charabia/src/segmenter/khmer.rs
Original file line number Diff line number Diff line change
@@ -1,29 +1,24 @@
use std::vec;

use icu::segmenter::WordSegmenter;
use fst::raw::Fst;

// Import `Segmenter` trait.
use crate::segmenter::utils::FstSegmenter;
use crate::segmenter::Segmenter;

extern crate alloc; // required as my-data-mod is written for #[no_std]
use icu_provider_blob::BlobDataProvider;

//TIP: Some segmentation libraries need to initialize an instance of the Segmenter.
// This initialization could be time-consuming and shouldn't be done at each call of `segment_str`.
// In this case, you may want to store the initialized instance in a lazy static like below and call it in `segment_str`.
// Otherwise, just remove below lines.
//
// Put this import at the top of the file.
use once_cell::sync::Lazy;
//
static SEGMENTER: Lazy<WordSegmenter> = Lazy::new(|| {
let blob = include_bytes!("../../dictionaries/bin/icu4x-khmer-keys");

let buffer_provider: BlobDataProvider =
BlobDataProvider::try_new_from_static_blob(blob).expect("failed to load khmer keys");
// dictionary source - https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/khmerdict.txt
static WORDS_FST: Lazy<Fst<&[u8]>> =
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/khmer/words.fst")[..]).unwrap());

WordSegmenter::try_new_dictionary_with_buffer_provider(&buffer_provider)
.expect("failed to initialize khmer word segmenter")
});
static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST));

// Write a short doc comment for the specialized Segmenter, like the one below.
/// <Script/Language> specialized [`Segmenter`].
Expand All @@ -39,25 +34,7 @@ pub struct KhmerSegmenter;
// All specialized segmenters only need to implement the method `segment_str` of the `Segmenter` trait.
impl Segmenter for KhmerSegmenter {
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
let (_, positions) =
SEGMENTER.segment_str(to_segment).fold((None, vec![]), |mut acc, elem| {
if acc.0.is_some() {
acc.1.push((acc.0.unwrap(), elem));
}

acc.0 = Some(elem);

acc
});

// Return the created iterator wrapping it in a Box.
Box::new(
positions
.iter()
.map(|(start, end)| &to_segment[*start..*end])
.collect::<Vec<&str>>()
.into_iter(),
)
FST_SEGMENTER.segment_str(to_segment)
}
}

Expand Down

0 comments on commit b31e01d

Please sign in to comment.