Skip to content

Commit

Permalink
Fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ManyTheFish committed Jul 22, 2024
1 parent f9732b2 commit f70289d
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 38 deletions.
6 changes: 3 additions & 3 deletions charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,11 @@ mod test {
#[test]
fn from_into_language() {
assert_eq!(Language::Eng.code(), "eng");
assert_eq!(Language::from_code("eng"), Language::Eng);
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
assert_eq!(Language::Jpn.code(), "jpn");
assert_eq!(Language::from_code("jpn"), Language::Jpn);
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
assert_eq!(Language::Cmn.code(), "cmn");
assert_eq!(Language::from_code("cmn"), Language::Cmn);
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};
use crate::{Language, Token};

static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ mod test {
"snake", "_", "case",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
61 changes: 28 additions & 33 deletions charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ mod thai;
#[cfg(any(feature = "thai", feature = "khmer"))]
mod utils;

/// Map from a `(Script, Option<Language>)` key to the boxed [`Segmenter`] registered for that key.
pub type SegmenterMap = HashMap<(Script, Option<Language>), Box<dyn Segmenter>>;

/// List of used [`Segmenter`]s linked to their corresponding [`Script`] and [`Language`].
///
/// This list is used after `Script` and `Language` detection to pick the specialized [`Segmenter`].
Expand All @@ -48,39 +50,32 @@ mod utils;
/// A segmenter assigned to a `None` language is considered as the default `Segmenter` for any `Language` that uses the assigned `Script`.
/// For example, [`LatinSegmenter`] is assigned to `(Script::Latin, None)`,
/// meaning that `LatinSegmenter` is the default `Segmenter` for any `Language` that uses `Latin` `Script`.
pub static SEGMENTERS: Lazy<HashMap<(Script, Option<Language>), Box<dyn Segmenter>>> =
Lazy::new(|| {
vec![
// latin segmenter
((Script::Latin, None), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
#[cfg(feature = "swedish-recomposition")]
((Script::Latin, Some(Language::Swe)), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Some(Language::Cmn)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
((Script::Cj, Some(Language::Jpn)), Box::new(JapaneseSegmenter) as Box<dyn Segmenter>),
// korean segmenter
#[cfg(feature = "korean")]
(
(Script::Hangul, Some(Language::Kor)),
Box::new(KoreanSegmenter) as Box<dyn Segmenter>,
),
// thai segmenter
#[cfg(feature = "thai")]
((Script::Thai, Some(Language::Tha)), Box::new(ThaiSegmenter) as Box<dyn Segmenter>),
#[cfg(feature = "khmer")]
((Script::Khmer, Some(Language::Khm)), Box::new(KhmerSegmenter) as Box<dyn Segmenter>),
// arabic segmenter
(
(Script::Arabic, Some(Language::Ara)),
Box::new(ArabicSegmenter) as Box<dyn Segmenter>,
),
]
.into_iter()
.collect()
});
pub static SEGMENTERS: Lazy<SegmenterMap> = Lazy::new(|| {
// Entries guarded by `#[cfg(feature = "...")]` are only compiled in (and thus only
// registered) when the corresponding cargo feature is enabled.
vec![
// latin segmenter, keyed by script only (language is `None`)
((Script::Latin, None), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// swedish is registered explicitly and reuses the latin segmenter
#[cfg(feature = "swedish-recomposition")]
((Script::Latin, Some(Language::Swe)), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Some(Language::Cmn)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
((Script::Cj, Some(Language::Jpn)), Box::new(JapaneseSegmenter) as Box<dyn Segmenter>),
// korean segmenter
#[cfg(feature = "korean")]
((Script::Hangul, Some(Language::Kor)), Box::new(KoreanSegmenter) as Box<dyn Segmenter>),
// thai segmenter
#[cfg(feature = "thai")]
((Script::Thai, Some(Language::Tha)), Box::new(ThaiSegmenter) as Box<dyn Segmenter>),
// khmer segmenter
#[cfg(feature = "khmer")]
((Script::Khmer, Some(Language::Khm)), Box::new(KhmerSegmenter) as Box<dyn Segmenter>),
// arabic segmenter
((Script::Arabic, Some(Language::Ara)), Box::new(ArabicSegmenter) as Box<dyn Segmenter>),
]
// turn the list of (key, segmenter) pairs into the `SegmenterMap`
.into_iter()
.collect()
});

/// Picked [`Segmenter`] when no segmenter is specialized to the detected [`Script`].
pub static DEFAULT_SEGMENTER: Lazy<Box<dyn Segmenter>> = Lazy::new(|| Box::new(LatinSegmenter));
Expand Down

0 comments on commit f70289d

Please sign in to comment.