From 24a18f06118d38cc684cd2298b28e9041a26d2d3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 2 Jun 2023 12:29:11 +0800 Subject: [PATCH] simplify api --- Cargo.toml | 2 +- src/tokenizer/alphanum_only.rs | 4 ++-- src/tokenizer/ascii_folding_filter.rs | 4 ++-- src/tokenizer/empty_tokenizer.rs | 2 +- src/tokenizer/facet_tokenizer.rs | 10 +++++----- src/tokenizer/lower_caser.rs | 4 ++-- src/tokenizer/ngram_tokenizer.rs | 10 +++++----- src/tokenizer/raw_tokenizer.rs | 2 +- src/tokenizer/regex_tokenizer.rs | 14 +++++++------- src/tokenizer/remove_long.rs | 4 ++-- src/tokenizer/simple_tokenizer.rs | 12 ++++++------ src/tokenizer/split_compound_words.rs | 4 ++-- src/tokenizer/stemmer.rs | 4 ++-- src/tokenizer/stop_word_filter/mod.rs | 4 ++-- src/tokenizer/tokenizer.rs | 6 +++--- src/tokenizer/whitespace_tokenizer.rs | 12 ++++++------ tokenizer-api/src/lib.rs | 4 ++-- 17 files changed, 51 insertions(+), 51 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0be3f600e4..6215daa06b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,7 +77,7 @@ proptest = "1.0.0" criterion = "0.5" test-log = "0.2.10" env_logger = "0.10.0" -pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] } +pprof = { git = "https://github.com/PSeitz/pprof-rs", rev = "7a92207", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5 futures = "0.3.21" paste = "1.0.11" more-asserts = "0.3.1" diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index abca2f671a..b40731fd34 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -50,9 +50,9 @@ impl TokenFilter for AlphaNumOnlyFilter { pub struct AlphaNumOnlyFilterWrapper(T); impl Tokenizer for AlphaNumOnlyFilterWrapper { - type TokenStream<'a, 'b> = AlphaNumOnlyFilterStream>; + type TokenStream<'a> = AlphaNumOnlyFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { AlphaNumOnlyFilterStream { tail: self.0.token_stream(text), } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 65fa51df1f..4b3655febf 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -20,9 +20,9 @@ impl TokenFilter for AsciiFoldingFilter { pub struct AsciiFoldingFilterWrapper(T); impl Tokenizer for AsciiFoldingFilterWrapper { - type TokenStream<'a, 'b> = AsciiFoldingFilterTokenStream>; + type TokenStream<'a> = AsciiFoldingFilterTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { AsciiFoldingFilterTokenStream { buffer: String::with_capacity(100), tail: self.0.token_stream(text), diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs index deee002070..46cb78c10d 100644 --- a/src/tokenizer/empty_tokenizer.rs +++ b/src/tokenizer/empty_tokenizer.rs @@ -4,7 +4,7 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer}; pub(crate) struct EmptyTokenizer; impl Tokenizer for EmptyTokenizer { - type TokenStream<'a, 'b> = EmptyTokenStream; + type TokenStream<'a> = EmptyTokenStream; fn token_stream(&mut self, _text: &str) -> EmptyTokenStream { EmptyTokenStream::default() } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 01bd659eea..568d60ae31 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -21,15 +21,15 @@ enum State { Terminated, } -pub struct FacetTokenStream<'a, 'b> { +pub struct FacetTokenStream<'a> { text: &'a str, state: State, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for FacetTokenizer { - type TokenStream<'a, 'b> = FacetTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> FacetTokenStream<'a, 'b> { + type TokenStream<'a> = FacetTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> { self.token.reset(); self.token.position = 0; FacetTokenStream { @@ -40,7 +40,7 @@ impl Tokenizer for FacetTokenizer { } } -impl<'a, 'b> TokenStream for FacetTokenStream<'a, 'b> { +impl<'a> TokenStream for FacetTokenStream<'a> { fn advance(&mut self) -> bool { match self.state { State::RootFacetNotEmitted => { diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index a0efe68b4b..18696b6b75 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -18,9 +18,9 @@ impl TokenFilter for LowerCaser { pub struct LowerCaserFilter(T); impl Tokenizer for LowerCaserFilter { - type TokenStream<'a, 'b> = LowerCaserTokenStream>; + type TokenStream<'a> = LowerCaserTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { LowerCaserTokenStream { tail: self.0.token_stream(text), buffer: String::new(), // TODO move to global buffer diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index e3e4cd59a7..ae54cacf4f 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -121,7 +121,7 @@ impl NgramTokenizer { } /// TokenStream associate to the `NgramTokenizer` -pub struct NgramTokenStream<'a, 'b> { +pub struct NgramTokenStream<'a> { /// parameters ngram_charidx_iterator: StutteringIterator>, /// true if the NgramTokenStream is in prefix mode. @@ -129,12 +129,12 @@ pub struct NgramTokenStream<'a, 'b> { /// input text: &'a str, /// output - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for NgramTokenizer { - type TokenStream<'a, 'b> = NgramTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> NgramTokenStream<'a, 'b> { + type TokenStream<'a> = NgramTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> { self.token.reset(); NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( @@ -149,7 +149,7 @@ impl Tokenizer for NgramTokenizer { } } -impl<'a, 'b> TokenStream for NgramTokenStream<'a, 'b> { +impl<'a> TokenStream for NgramTokenStream<'a> { fn advance(&mut self) -> bool { if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() { if self.prefix_only && offset_from > 0 { diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 07079abf20..9bf7ee22a9 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -12,7 +12,7 @@ pub struct RawTokenStream<'a> { } impl Tokenizer for RawTokenizer { - type TokenStream<'b, 'a> = RawTokenStream<'a>; + type TokenStream<'a> = RawTokenStream<'a>; fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> { self.token.reset(); self.token.position = 0; diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index cd502a55d3..f9a10ad201 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -64,8 +64,8 @@ impl RegexTokenizer { } impl Tokenizer for RegexTokenizer { - type TokenStream<'a, 'b> = RegexTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> RegexTokenStream<'a, 'b> { + type TokenStream<'a> = RegexTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> { self.token.reset(); RegexTokenStream { regex: self.regex.clone(), @@ -76,14 +76,14 @@ impl Tokenizer for RegexTokenizer { } } -pub struct RegexTokenStream<'a, 'b> { +pub struct RegexTokenStream<'a> { regex: Regex, text: &'a str, - token: &'b mut Token, + token: &'a mut Token, cursor: usize, } -impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> { +impl<'a> TokenStream for RegexTokenStream<'a> { fn advance(&mut self) -> bool { let Some(regex_match) = self.regex.find(self.text) else { return false; @@ -105,11 +105,11 @@ impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index d18f57450b..78f3e731ac 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -55,9 +55,9 @@ pub struct RemoveLongFilterWrapper { } impl Tokenizer for RemoveLongFilterWrapper { - type TokenStream<'a, 'b> = RemoveLongFilterStream>; + type TokenStream<'a> = RemoveLongFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { RemoveLongFilterStream { token_length_limit: self.length_limit, tail: self.inner.token_stream(text), diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index ff49efa38e..540dfac477 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -9,15 +9,15 @@ pub struct SimpleTokenizer { } /// TokenStream produced by the `SimpleTokenizer`. -pub struct SimpleTokenStream<'a, 'b> { +pub struct SimpleTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for SimpleTokenizer { - type TokenStream<'a, 'b> = SimpleTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> SimpleTokenStream<'a, 'b> { + type TokenStream<'a> = SimpleTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> { self.token.reset(); SimpleTokenStream { text, @@ -27,7 +27,7 @@ impl Tokenizer for SimpleTokenizer { } } -impl<'a, 'b> SimpleTokenStream<'a, 'b> { +impl<'a> SimpleTokenStream<'a> { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -38,7 +38,7 @@ impl<'a, 'b> SimpleTokenStream<'a, 'b> { } } -impl<'a, 'b> TokenStream for SimpleTokenStream<'a, 'b> { +impl<'a> TokenStream for SimpleTokenStream<'a> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs index b2366b10b7..bcde161cc8 100644 --- a/src/tokenizer/split_compound_words.rs +++ b/src/tokenizer/split_compound_words.rs @@ -97,9 +97,9 @@ pub struct SplitCompoundWordsFilter { } impl Tokenizer for SplitCompoundWordsFilter { - type TokenStream<'a, 'b> = SplitCompoundWordsTokenStream>; + type TokenStream<'a> = SplitCompoundWordsTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { SplitCompoundWordsTokenStream { dict: self.dict.clone(), tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 081fd6be4e..4c43b609ab 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -98,9 +98,9 @@ pub struct StemmerFilter { } impl Tokenizer for StemmerFilter { - type TokenStream<'a, 'b> = StemmerTokenStream>; + type TokenStream<'a> = StemmerTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); StemmerTokenStream { tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs index a1b894130c..80f1fe4938 100644 --- a/src/tokenizer/stop_word_filter/mod.rs +++ b/src/tokenizer/stop_word_filter/mod.rs @@ -88,9 +88,9 @@ pub struct StopWordFilterWrapper { } impl Tokenizer for StopWordFilterWrapper { - type TokenStream<'a, 'b> = StopWordFilterStream>; + type TokenStream<'a> = StopWordFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { StopWordFilterStream { words: self.words.clone(), tail: self.inner.token_stream(text), diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 2c70969899..ccab6cda73 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -12,13 +12,13 @@ pub struct TextAnalyzer { /// A boxable `Tokenizer`, with its `TokenStream` type erased. trait BoxableTokenizer: 'static + Send + Sync { /// Creates a boxed token stream for a given `str`. - fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a>; + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>; /// Clone this tokenizer. fn box_clone(&self) -> Box; } impl BoxableTokenizer for T { - fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.token_stream(text).into() } fn box_clone(&self) -> Box { @@ -53,7 +53,7 @@ impl TextAnalyzer { } /// Creates a token stream for a given `str`. - pub fn token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { + pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.tokenizer.box_token_stream(text) } } diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 8b411ad054..69a3b05509 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -8,15 +8,15 @@ pub struct WhitespaceTokenizer { token: Token, } -pub struct WhitespaceTokenStream<'a, 'b> { +pub struct WhitespaceTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for WhitespaceTokenizer { - type TokenStream<'a, 'b> = WhitespaceTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> WhitespaceTokenStream<'a, 'b> { + type TokenStream<'a> = WhitespaceTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> { self.token.reset(); WhitespaceTokenStream { text, @@ -26,7 +26,7 @@ impl Tokenizer for WhitespaceTokenizer { } } -impl<'a, 'b> WhitespaceTokenStream<'a, 'b> { +impl<'a> WhitespaceTokenStream<'a> { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -37,7 +37,7 @@ impl<'a, 'b> WhitespaceTokenStream<'a, 'b> { } } -impl<'a, 'b> TokenStream for WhitespaceTokenStream<'a, 'b> { +impl<'a> TokenStream for WhitespaceTokenStream<'a> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index ab0fa8c955..e563443715 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -55,9 +55,9 @@ impl Token { /// before indexing. pub trait Tokenizer: 'static + Clone + Send + Sync { /// The token stream returned by this Tokenizer. - type TokenStream<'a, 'b>: TokenStream; + type TokenStream<'a>: TokenStream; /// Creates a token stream for a given `str`. - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>; } /// Simple wrapper of `Box`.