Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify lang detection #299

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@ exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.3"
cow-utils = "0.1"
csv = "1.3.0"
deunicode = "1.6.0"
either = "1.13.0"
finl_unicode = { version= "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.32.2", default-features = false, optional = true }
Expand All @@ -31,8 +29,6 @@ pinyin = { version = "0.10", default-features = false, features = [
wana_kana = { version = "3.0.0", optional = true }
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
litemap = "0.7.3"
zerovec = "0.10.4"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
Expand Down
35 changes: 13 additions & 22 deletions charabia/src/detection/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::HashMap;

pub use script_language::{Language, Script};
use whatlang::Detector;

Expand All @@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
inner: &'o str,
pub script: Option<Script>,
pub language: Option<Language>,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
allow_list: Option<&'al [Language]>,
}

impl<'o, 'al> StrDetection<'o, 'al> {
pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
Self { inner, script: None, language: None, allow_list }
}

Expand All @@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
*self.script.get_or_insert_with(|| Self::detect_script(inner))
}

pub fn language(&mut self) -> Language {
pub fn language(&mut self) -> Option<Language> {
let inner = self.inner;
let script = self.script();
*self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
self.language = match self.language.take() {
Some(lang) => Some(lang),
None => Self::detect_lang(inner, self.allow_list),
};

self.language
}

/// detect script with whatlang,
Expand All @@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {

/// detect lang with whatlang
/// if no language is detected, return Language::Other
fn detect_lang(
text: &str,
script: Script,
allow_list: Option<&HashMap<Script, Vec<Language>>>,
) -> Language {
fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
let detector = allow_list
.and_then(|allow_list| allow_list.get(&script))
.map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
.map(Detector::with_allowlist)
.unwrap_or_default();

detector.detect_lang(text).map(Language::from).unwrap_or_default()
detector.detect_lang(text).map(Language::from)
}
}

pub trait Detect<'o, 'al> {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al>;
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
}

impl<'o, 'al> Detect<'o, 'al> for &str {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al> {
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
StrDetection::new(self, allow_list)
}
}
32 changes: 12 additions & 20 deletions charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ use core::str::FromStr;

#[cfg(test)]
use quickcheck::{Arbitrary, Gen};
use serde::{Deserialize, Serialize};

use super::chars;

macro_rules! make_language {
($($language:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Language {
$($language),+,
Other,
}
impl From<whatlang::Lang> for Language {
fn from(other: whatlang::Lang) -> Language {
Expand All @@ -24,27 +24,19 @@ macro_rules! make_language {
fn from(other: Language) -> whatlang::Lang {
match other {
$(Language::$language => whatlang::Lang::$language), +,
_other => whatlang::Lang::Eng,
}
}
}

impl Default for Language {
fn default() -> Self {
Self::Other
}
}

impl Language {
pub fn name(&self) -> &'static str {
pub fn code(&self) -> &'static str {
match self {
$(Language::$language => whatlang::Lang::$language.code()), +,
_other => "other",
}
}

pub fn from_name<S: AsRef<str>>(code: S) -> Language {
whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
}
}
};
Expand Down Expand Up @@ -124,7 +116,7 @@ make_language! {

macro_rules! make_script {
($($script:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Script {
$($script),+,
Cj,
Expand Down Expand Up @@ -361,12 +353,12 @@ mod test {

#[test]
fn from_into_language() {
assert_eq!(Language::Eng.name(), "eng");
assert_eq!(Language::from_name("eng"), Language::Eng);
assert_eq!(Language::Jpn.name(), "jpn");
assert_eq!(Language::from_name("jpn"), Language::Jpn);
assert_eq!(Language::Cmn.name(), "cmn");
assert_eq!(Language::from_name("cmn"), Language::Cmn);
assert_eq!(Language::Eng.code(), "eng");
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
assert_eq!(Language::Jpn.code(), "jpn");
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
assert_eq!(Language::Cmn.code(), "cmn");
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
Comment on lines +356 to +361
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure if this is relevant, but I thought we were using https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes. I would have expected just en for English, ja for Japanese, etc.

Looks like we're not using set 1, but the other sets? Is set 1 usable?

Copy link
Member Author

@ManyTheFish ManyTheFish Jul 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These codes are inherited from whatlang, which follows ISO-639-3; the bigram codes come from ISO-639-1, which is an earlier version of the standard.
Is there any issue with using a 3-letter code?

Copy link
Contributor

@dureuill dureuill Jul 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is about user expectation. Maybe that's just me, but as a user, I would expect my language codes to be "EN" or "en", not "ENG".

The best solution would be to support both en and eng.

}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ mod detection;
mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use detection::{Language, Script, StrDetection};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};
Expand Down
13 changes: 8 additions & 5 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
};

/// Iterator over Normalized [`Token`]s.
pub struct NormalizedTokenIter<'o, 'tb> {
token_iter: SegmentedTokenIter<'o, 'tb>,
pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
options: &'tb NormalizerOption<'tb>,
}

impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
type Item = Token<'o>;

fn next(&mut self) -> Option<Self::Item> {
Expand Down Expand Up @@ -232,11 +232,14 @@ impl From<String> for CharOrStr {
}
}

impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
/// Normalize [`Token`]s using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
pub fn normalize<'tb>(
self,
options: &'tb NormalizerOption<'tb>,
) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
NormalizedTokenIter { token_iter: self, options }
}
}
Expand Down
8 changes: 6 additions & 2 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};
use crate::{Language, Token};

static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
Expand Down Expand Up @@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {

// Returns `true` if the Normalizer should be used.
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
}
}

Expand All @@ -101,6 +101,7 @@ mod test {
use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;
use crate::Script;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
Expand All @@ -109,6 +110,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand All @@ -121,6 +123,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down Expand Up @@ -148,6 +151,7 @@ mod test {
]),
script: Script::Latin,
kind: TokenKind::Word,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ mod test {
"snake", "_", "case",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
Loading
Loading