From 26a0e7b6dee93a4045674c3a3932fdc65b62e331 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 May 2023 17:15:51 +0800 Subject: [PATCH 1/4] tokenizer-api: reduce Tokenizer overhead Previously a new `Token` for each text encountered was created, which contains `String::with_capacity(200)` In the new API the token_stream gets mutable access to the tokenizer, this allows state to be shared (in this PR Token is shared). Ideally the allocation for the BoxTokenStream would also be removed, but this may require some lifetime tricks. --- benches/analyzer.rs | 2 +- examples/pre_tokenized_text.rs | 3 +- examples/stop_words.rs | 2 +- src/core/json_utils.rs | 8 ++-- src/fastfield/mod.rs | 2 +- src/fastfield/writer.rs | 12 ++--- src/indexer/segment_writer.rs | 8 ++-- src/postings/mod.rs | 4 +- src/query/more_like_this/more_like_this.rs | 44 +++++++++-------- src/query/query_parser/query_parser.rs | 16 +++---- src/snippet/mod.rs | 55 +++++++++++++++++----- src/tokenizer/alphanum_only.rs | 10 ++-- src/tokenizer/ascii_folding_filter.rs | 12 ++--- src/tokenizer/empty_tokenizer.rs | 6 +-- src/tokenizer/facet_tokenizer.rs | 32 ++++++------- src/tokenizer/lower_caser.rs | 13 ++--- src/tokenizer/mod.rs | 20 ++++---- src/tokenizer/ngram_tokenizer.rs | 21 +++++---- src/tokenizer/raw_tokenizer.rs | 38 ++++++++------- src/tokenizer/regex_tokenizer.rs | 23 +++++---- src/tokenizer/remove_long.rs | 8 ++-- src/tokenizer/simple_tokenizer.rs | 27 ++++++----- src/tokenizer/split_compound_words.rs | 24 +++++----- src/tokenizer/stemmer.rs | 4 +- src/tokenizer/stop_word_filter/mod.rs | 8 ++-- src/tokenizer/tokenizer.rs | 8 ++-- src/tokenizer/tokenizer_manager.rs | 8 ++-- src/tokenizer/whitespace_tokenizer.rs | 27 ++++++----- tokenizer-api/src/lib.rs | 15 +++++- 29 files changed, 262 insertions(+), 198 deletions(-) diff --git a/benches/analyzer.rs b/benches/analyzer.rs index caebc7153f..7a96fa119a 100644 --- a/benches/analyzer.rs +++ b/benches/analyzer.rs @@ -5,7 +5,7 @@ const ALICE_TXT: &str = include_str!("alice.txt"); pub fn criterion_benchmark(c: &mut Criterion) { let tokenizer_manager = TokenizerManager::default(); - let tokenizer = tokenizer_manager.get("default").unwrap(); + let mut tokenizer = tokenizer_manager.get("default").unwrap(); c.bench_function("default-tokenize-alice", |b| { b.iter(|| { let mut word_count = 0; diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index c6595cef9c..126b598ef0 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy}; use tempfile::TempDir; fn pre_tokenize_text(text: &str) -> Vec { - let mut token_stream = SimpleTokenizer.token_stream(text); + let mut tokenizer = SimpleTokenizer::default(); + let mut token_stream = tokenizer.token_stream(text); let mut tokens = vec![]; while token_stream.advance() { tokens.push(token_stream.token().clone()); diff --git a/examples/stop_words.rs b/examples/stop_words.rs index b1c8d7fbb5..4b1f52a57a 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = TextAnalyzer::builder(SimpleTokenizer) + let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) .filter(StopWordFilter::remove(vec![ "the".to_string(), diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 
9432bbc46d..0dec432d89 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -67,7 +67,7 @@ impl IndexingPositionsPerPath { pub(crate) fn index_json_values<'a>( doc: DocId, json_values: impl Iterator>>, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, expand_dots_enabled: bool, term_buffer: &mut Term, postings_writer: &mut dyn PostingsWriter, @@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>( fn index_json_object( doc: DocId, json_value: &serde_json::Map, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, json_term_writer: &mut JsonTermWriter, postings_writer: &mut dyn PostingsWriter, ctx: &mut IndexingContext, @@ -117,7 +117,7 @@ fn index_json_object( fn index_json_value( doc: DocId, json_value: &serde_json::Value, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, json_term_writer: &mut JsonTermWriter, postings_writer: &mut dyn PostingsWriter, ctx: &mut IndexingContext, @@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term( pub(crate) fn set_string_and_get_terms( json_term_writer: &mut JsonTermWriter, value: &str, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, ) -> Vec<(usize, Term)> { let mut positions_and_terms = Vec::<(usize, Term)>::new(); json_term_writer.close_path_and_set_type(Type::Str); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 8046898f1e..d450e3e593 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1208,7 +1208,7 @@ mod tests { let ff_tokenizer_manager = TokenizerManager::default(); ff_tokenizer_manager.register( "custom_lowercase", - TextAnalyzer::builder(RawTokenizer) + TextAnalyzer::builder(RawTokenizer::default()) .filter(LowerCaser) .build(), ); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a1a97bdb44..6389ce8a2c 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -147,7 +147,7 @@ impl FastFieldsWriter { } Value::Str(text_val) => { if let Some(tokenizer) = - &self.per_field_tokenizer[field_value.field().field_id() as usize] + &mut self.per_field_tokenizer[field_value.field().field_id() as usize] { let mut token_stream = tokenizer.token_stream(text_val); token_stream.process(&mut |token: &Token| { @@ -202,7 +202,7 @@ impl FastFieldsWriter { self.json_path_buffer.push_str(field_name); let text_analyzer = - &self.per_field_tokenizer[field_value.field().field_id() as usize]; + &mut self.per_field_tokenizer[field_value.field().field_id() as usize]; record_json_obj_to_columnar_writer( doc_id, @@ -263,7 +263,7 @@ fn record_json_obj_to_columnar_writer( remaining_depth_limit: usize, json_path_buffer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, - tokenizer: &Option, + tokenizer: &mut Option, ) { for (key, child) in json_obj { let len_path = json_path_buffer.len(); @@ -302,7 +302,7 @@ fn record_json_value_to_columnar_writer( mut remaining_depth_limit: usize, json_path_writer: &mut String, columnar_writer: &mut columnar::ColumnarWriter, - tokenizer: &Option, + tokenizer: &mut Option, ) { if remaining_depth_limit == 0 { return; @@ -321,7 +321,7 @@ fn record_json_value_to_columnar_writer( } } serde_json::Value::String(text) => { - if let Some(text_analyzer) = tokenizer { + if let Some(text_analyzer) = tokenizer.as_mut() { let mut token_stream = text_analyzer.token_stream(text); token_stream.process(&mut |token| { columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text); @@ -379,7 +379,7 @@ mod tests { JSON_DEPTH_LIMIT, &mut json_path, &mut columnar_writer, - &None, + &mut None, ); } 
let mut buffer = Vec::new(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c21d7d5348..ed5f60b244 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -185,10 +185,11 @@ impl SegmentWriter { match field_entry.field_type() { FieldType::Facet(_) => { + let mut facet_tokenizer = FacetTokenizer::default(); // this can be global for value in values { let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = facet.encoded_str(); - let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str); + let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut indexing_position = IndexingPosition::default(); postings_writer.index_text( doc_id, @@ -208,7 +209,7 @@ impl SegmentWriter { } Value::Str(ref text) => { let text_analyzer = - &self.per_field_text_analyzers[field.field_id() as usize]; + &mut self.per_field_text_analyzers[field.field_id() as usize]; text_analyzer.token_stream(text) } _ => { @@ -304,7 +305,8 @@ impl SegmentWriter { } } FieldType::JsonObject(json_options) => { - let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize]; + let text_analyzer = + &mut self.per_field_text_analyzers[field.field_id() as usize]; let json_values_it = values.map(|value| value.as_json().ok_or_else(make_schema_error)); index_json_values( diff --git a/src/postings/mod.rs b/src/postings/mod.rs index dd33974459..09265b0858 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -162,7 +162,7 @@ pub mod tests { let index = Index::create_in_ram(schema); index .tokenizers() - .register("simple_no_truncation", SimpleTokenizer); + .register("simple_no_truncation", SimpleTokenizer::default()); let reader = index.reader()?; let mut index_writer = index.writer_for_tests()?; @@ -194,7 +194,7 @@ pub mod tests { let index = Index::create_in_ram(schema); index .tokenizers() - .register("simple_no_truncation", SimpleTokenizer); + .register("simple_no_truncation", SimpleTokenizer::default()); let reader = index.reader()?; let mut index_writer = index.writer_for_tests()?; diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index d18e272758..994dd96c02 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -192,45 +192,49 @@ impl MoreLikeThis { }) .collect::>>()?; for fake_str in facets { - FacetTokenizer.token_stream(fake_str).process(&mut |token| { - if self.is_noise_word(token.text.clone()) { - let term = Term::from_field_text(field, &token.text); - *term_frequencies.entry(term).or_insert(0) += 1; - } - }); + FacetTokenizer::default() + .token_stream(fake_str) + .process(&mut |token| { + if self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } } FieldType::Str(text_options) => { - let mut token_streams: Vec = vec![]; - for value in values { match value { Value::PreTokStr(tok_str) => { - token_streams.push(PreTokenizedStream::from(tok_str.clone()).into()); + let mut token_stream: BoxTokenStream = + PreTokenizedStream::from(tok_str.clone()).into(); + token_stream.process(&mut |token| { + if !self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } Value::Str(ref text) => { - if let Some(tokenizer) = text_options + if let Some(mut tokenizer) = text_options .get_indexing_options() 
.map(|text_indexing_options| { text_indexing_options.tokenizer().to_string() }) .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)) { - token_streams.push(tokenizer.token_stream(text)); + let mut token_stream = tokenizer.token_stream(text); + token_stream.process(&mut |token| { + if !self.is_noise_word(token.text.clone()) { + let term = Term::from_field_text(field, &token.text); + *term_frequencies.entry(term).or_insert(0) += 1; + } + }); } } _ => (), } } - - for mut token_stream in token_streams { - token_stream.process(&mut |token| { - if !self.is_noise_word(token.text.clone()) { - let term = Term::from_field_text(field, &token.text); - *term_frequencies.entry(term).or_insert(0) += 1; - } - }); - } } FieldType::U64(_) => { for value in values { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index cfb7cbd5b7..72a735848b 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -403,7 +403,7 @@ impl QueryParser { // This should have been seen earlier really. QueryParserError::FieldNotIndexed(field_entry.name().to_string()) })?; - let text_analyzer = + let mut text_analyzer = self.tokenizer_manager .get(option.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { @@ -497,7 +497,7 @@ impl QueryParser { // This should have been seen earlier really. QueryParserError::FieldNotIndexed(field_name.to_string()) })?; - let text_analyzer = self + let mut text_analyzer = self .tokenizer_manager .get(indexing_options.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { @@ -511,7 +511,7 @@ impl QueryParser { slop, prefix, indexing_options, - &text_analyzer, + &mut text_analyzer, )? .into_iter() .collect()) @@ -795,7 +795,7 @@ fn generate_literals_for_str( slop: u32, prefix: bool, indexing_options: &TextFieldIndexing, - text_analyzer: &TextAnalyzer, + text_analyzer: &mut TextAnalyzer, ) -> Result, QueryParserError> { let mut terms: Vec<(usize, Term)> = Vec::new(); let mut token_stream = text_analyzer.token_stream(phrase); @@ -840,7 +840,7 @@ fn generate_literals_for_json_object( // This should have been seen earlier really. 
QueryParserError::FieldNotIndexed(field_name.to_string()) })?; - let text_analyzer = tokenizer_manager + let mut text_analyzer = tokenizer_manager .get(text_options.tokenizer()) .ok_or_else(|| QueryParserError::UnknownTokenizer { field: field_name.to_string(), @@ -858,7 +858,7 @@ fn generate_literals_for_json_object( if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) { logical_literals.push(LogicalLiteral::Term(term)); } - let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer); + let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer); drop(json_term_writer); if terms.len() <= 1 { for (_, term) in terms { @@ -959,7 +959,7 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) .filter(StopWordFilter::remove(vec!["the".to_string()])) .build(), @@ -1463,7 +1463,7 @@ mod test { let index = Index::create_in_ram(schema); index .tokenizers() - .register("customtokenizer", SimpleTokenizer); + .register("customtokenizer", SimpleTokenizer::default()); let query_parser = QueryParser::for_index(&index, vec![title]); assert_eq!( query_parser.parse_query("title:\"happy tax\"").unwrap_err(), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 09fd9c8d6f..bf3a1f82cb 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -145,7 +145,7 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments( - tokenizer: &TextAnalyzer, + tokenizer: &mut TextAnalyzer, text: &str, terms: &BTreeMap, max_num_chars: usize, @@ -370,8 +370,12 @@ impl SnippetGenerator { /// Generates a snippet for the given text. 
pub fn snippet(&self, text: &str) -> Snippet { - let fragment_candidates = - search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars); + let fragment_candidates = search_fragments( + &mut self.tokenizer.clone(), + text, + &self.terms_text, + self.max_num_chars, + ); select_best_fragment_combination(&fragment_candidates[..], text) } } @@ -408,7 +412,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 100, + ); assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -435,7 +444,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 20, + ); { let first = &fragments[0]; assert_eq!(first.score, 1.0); @@ -449,7 +463,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>0.9, String::from("language") => 1.0 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 20, + ); // assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -468,7 +487,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -490,7 +510,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -513,7 +534,8 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 7); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -535,7 +557,8 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -550,7 +573,8 @@ Survey in 2016, 2017, and 2018."#; let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); + let fragments = + search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], text); @@ -669,7 +693,7 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("bc"), 1.0); let fragments = search_fragments( - &From::from(NgramTokenizer::all_ngrams(2, 2)), + &mut From::from(NgramTokenizer::all_ngrams(2, 2)), text, &terms, 3, @@ -691,7 +715,12 @@ 
Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_generator_custom_highlighted_elements() { let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let fragments = search_fragments( + &mut From::from(SimpleTokenizer::default()), + TEST_TEXT, + &terms, + 100, + ); let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!( snippet.to_html(), diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index c0175e736e..abca2f671a 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(RawTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default()) //! .filter(AlphaNumOnlyFilter) //! .build(); //! @@ -11,7 +11,7 @@ //! // contains a space //! assert!(stream.next().is_none()); //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(AlphaNumOnlyFilter) //! .build(); //! @@ -50,9 +50,9 @@ impl TokenFilter for AlphaNumOnlyFilter { pub struct AlphaNumOnlyFilterWrapper(T); impl Tokenizer for AlphaNumOnlyFilterWrapper { - type TokenStream<'a> = AlphaNumOnlyFilterStream>; + type TokenStream<'a, 'b> = AlphaNumOnlyFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { AlphaNumOnlyFilterStream { tail: self.0.token_stream(text), } @@ -96,7 +96,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(AlphaNumOnlyFilter) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index beef3ff31c..65fa51df1f 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -20,9 +20,9 @@ impl TokenFilter for AsciiFoldingFilter { pub struct AsciiFoldingFilterWrapper(T); impl Tokenizer for AsciiFoldingFilterWrapper { - type TokenStream<'a> = AsciiFoldingFilterTokenStream>; + type TokenStream<'a, 'b> = AsciiFoldingFilterTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { AsciiFoldingFilterTokenStream { buffer: String::with_capacity(100), tail: self.0.token_stream(text), @@ -1573,7 +1573,7 @@ mod tests { fn folding_helper(text: &str) -> Vec { let mut tokens = Vec::new(); - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(AsciiFoldingFilter) .build() .token_stream(text) @@ -1584,10 +1584,10 @@ mod tests { } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = TextAnalyzer::builder(RawTokenizer) + let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default()) .filter(AsciiFoldingFilter) - .build() - .token_stream(text); + .build(); + let mut token_stream = tokenizer.token_stream(text); token_stream.advance(); token_stream.token().text.clone() } diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs index 4f4822206f..deee002070 100644 --- a/src/tokenizer/empty_tokenizer.rs +++ 
b/src/tokenizer/empty_tokenizer.rs @@ -4,8 +4,8 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer}; pub(crate) struct EmptyTokenizer; impl Tokenizer for EmptyTokenizer { - type TokenStream<'a> = EmptyTokenStream; - fn token_stream(&self, _text: &str) -> EmptyTokenStream { + type TokenStream<'a, 'b> = EmptyTokenStream; + fn token_stream(&mut self, _text: &str) -> EmptyTokenStream { EmptyTokenStream::default() } } @@ -35,7 +35,7 @@ mod tests { #[test] fn test_empty_tokenizer() { - let tokenizer = super::EmptyTokenizer; + let mut tokenizer = super::EmptyTokenizer; let mut empty = tokenizer.token_stream("whatever string"); assert!(!empty.advance()); } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 3f2f1df2f2..01bd659eea 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -9,8 +9,10 @@ use crate::schema::FACET_SEP_BYTE; /// - `/america/north_america/canada` /// - `/america/north_america` /// - `/america` -#[derive(Clone)] -pub struct FacetTokenizer; +#[derive(Clone, Default)] +pub struct FacetTokenizer { + token: Token, +} #[derive(Debug)] enum State { @@ -19,28 +21,26 @@ enum State { Terminated, } -pub struct FacetTokenStream<'a> { +pub struct FacetTokenStream<'a, 'b> { text: &'a str, state: State, - token: Token, + token: &'b mut Token, } impl Tokenizer for FacetTokenizer { - type TokenStream<'a> = FacetTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> { - let token = Token { - position: 0, - ..Default::default() - }; + type TokenStream<'a, 'b> = FacetTokenStream<'a, 'b>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> FacetTokenStream<'a, 'b> { + self.token.reset(); + self.token.position = 0; FacetTokenStream { text, state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. 
- token, + token: &mut self.token, } } } -impl<'a> TokenStream for FacetTokenStream<'a> { +impl<'a, 'b> TokenStream for FacetTokenStream<'a, 'b> { fn advance(&mut self) -> bool { match self.state { State::RootFacetNotEmitted => { @@ -74,11 +74,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -98,7 +98,7 @@ mod tests { let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); tokens.push(format!("{}", facet)); }; - FacetTokenizer + FacetTokenizer::default() .token_stream(facet.encoded_str()) .process(&mut add_token); } @@ -118,7 +118,7 @@ mod tests { let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test tokens.push(format!("{}", facet)); }; - FacetTokenizer + FacetTokenizer::default() .token_stream(facet.encoded_str()) // ok test .process(&mut add_token); } diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index dc10d3e277..a0efe68b4b 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -18,12 +18,12 @@ impl TokenFilter for LowerCaser { pub struct LowerCaserFilter(T); impl Tokenizer for LowerCaserFilter { - type TokenStream<'a> = LowerCaserTokenStream>; + type TokenStream<'a, 'b> = LowerCaserTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { LowerCaserTokenStream { tail: self.0.token_stream(text), - buffer: String::new(), + buffer: String::new(), // TODO move to global buffer } } } @@ -86,10 +86,11 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let mut token_stream = TextAnalyzer::builder(SimpleTokenizer) + let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) - .build() - .token_stream(text); + .build(); + + let mut token_stream = token_stream.token_stream(text); let mut tokens = vec![]; let mut add_token = |token: &Token| { tokens.push(token.clone()); diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 8bd3fd465f..dbc3dd867d 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -66,7 +66,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let en_stem = TextAnalyzer::builder(SimpleTokenizer) +//! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) //! .filter(Stemmer::new(Language::English)) @@ -81,7 +81,7 @@ //! # use tantivy::tokenizer::*; //! # use tantivy::Index; //! # -//! let custom_en_tokenizer = SimpleTokenizer; +//! let custom_en_tokenizer = SimpleTokenizer::default(); //! # let schema = Schema::builder().build(); //! let index = Index::create_in_ram(schema); //! index.tokenizers() @@ -113,7 +113,7 @@ //! let index = Index::create_in_ram(schema); //! //! // We need to register our tokenizer : -//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) //! 
.build(); @@ -188,9 +188,9 @@ pub mod tests { } #[test] - fn test_raw_tokenizer() { + fn test_raw_tokenizer2() { let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("raw").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("raw").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -208,7 +208,7 @@ pub mod tests { fn test_en_tokenizer() { let tokenizer_manager = TokenizerManager::default(); assert!(tokenizer_manager.get("en_doesnotexist").is_none()); - let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -231,13 +231,13 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "el_stem", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::Greek)) .build(), ); - let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { @@ -257,7 +257,7 @@ pub mod tests { #[test] fn test_tokenizer_empty() { let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); + let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); { let mut tokens: Vec = vec![]; { @@ -283,7 +283,7 @@ pub mod tests { #[test] fn test_whitespace_tokenizer() { let tokenizer_manager = TokenizerManager::default(); - let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap(); + let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index b3af1dd03d..e3e4cd59a7 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -33,7 +33,7 @@ use super::{Token, TokenStream, Tokenizer}; /// ```rust /// use tantivy::tokenizer::*; /// -/// let tokenizer = NgramTokenizer::new(2, 3, false); +/// let mut tokenizer = NgramTokenizer::new(2, 3, false); /// let mut stream = tokenizer.token_stream("hello"); /// { /// let token = stream.next().unwrap(); @@ -87,6 +87,7 @@ pub struct NgramTokenizer { max_gram: usize, /// if true, will only parse the leading edge of the input prefix_only: bool, + token: Token, } impl NgramTokenizer { @@ -101,6 +102,7 @@ impl NgramTokenizer { min_gram, max_gram, prefix_only, + token: Token::default(), } } @@ -119,7 +121,7 @@ impl NgramTokenizer { } /// TokenStream associate to the `NgramTokenizer` -pub struct NgramTokenStream<'a> { +pub struct NgramTokenStream<'a, 'b> { /// parameters ngram_charidx_iterator: StutteringIterator>, /// true if the NgramTokenStream is in prefix mode. 
@@ -127,12 +129,13 @@ pub struct NgramTokenStream<'a> { /// input text: &'a str, /// output - token: Token, + token: &'b mut Token, } impl Tokenizer for NgramTokenizer { - type TokenStream<'a> = NgramTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> { + type TokenStream<'a, 'b> = NgramTokenStream<'a, 'b>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> NgramTokenStream<'a, 'b> { + self.token.reset(); NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), @@ -141,12 +144,12 @@ impl Tokenizer for NgramTokenizer { ), prefix_only: self.prefix_only, text, - token: Token::default(), + token: &mut self.token, } } } -impl<'a> TokenStream for NgramTokenStream<'a> { +impl<'a, 'b> TokenStream for NgramTokenStream<'a, 'b> { fn advance(&mut self) -> bool { if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() { if self.prefix_only && offset_from > 0 { @@ -164,10 +167,10 @@ impl<'a> TokenStream for NgramTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 901994915f..07079abf20 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,32 +1,34 @@ use super::{Token, TokenStream, Tokenizer}; /// For each value of the field, emit a single unprocessed token. -#[derive(Clone)] -pub struct RawTokenizer; - -pub struct RawTokenStream { +#[derive(Clone, Default)] +pub struct RawTokenizer { token: Token, +} + +pub struct RawTokenStream<'a> { + token: &'a mut Token, has_token: bool, } impl Tokenizer for RawTokenizer { - type TokenStream<'a> = RawTokenStream; - fn token_stream(&self, text: &str) -> RawTokenStream { - let token = Token { - offset_from: 0, - offset_to: text.len(), - position: 0, - text: text.to_string(), - position_length: 1, - }; + type TokenStream<'b, 'a> = RawTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> { + self.token.reset(); + self.token.position = 0; + self.token.position_length = 1; + self.token.offset_from = 0; + self.token.offset_to = text.len(); + self.token.text.clear(); + self.token.text.push_str(text); RawTokenStream { - token, + token: &mut self.token, has_token: true, } } } -impl TokenStream for RawTokenStream { +impl<'a> TokenStream for RawTokenStream<'a> { fn advance(&mut self) -> bool { let result = self.has_token; self.has_token = false; @@ -34,11 +36,11 @@ impl TokenStream for RawTokenStream { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -55,7 +57,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(RawTokenizer); + let mut a = TextAnalyzer::from(RawTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index f65a5cece1..cd502a55d3 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -22,7 +22,7 @@ use crate::TantivyError; /// ```rust /// use tantivy::tokenizer::*; /// -/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap(); +/// let mut tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap(); /// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'"); /// { /// let token = 
stream.next().unwrap(); @@ -48,6 +48,7 @@ use crate::TantivyError; #[derive(Clone)] pub struct RegexTokenizer { regex: Regex, + token: Token, } impl RegexTokenizer { @@ -55,30 +56,34 @@ impl RegexTokenizer { pub fn new(regex_pattern: &str) -> crate::Result { Regex::new(regex_pattern) .map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned())) - .map(|regex| Self { regex }) + .map(|regex| Self { + regex, + token: Token::default(), + }) } } impl Tokenizer for RegexTokenizer { - type TokenStream<'a> = RegexTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> { + type TokenStream<'a, 'b> = RegexTokenStream<'a, 'b>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> RegexTokenStream<'a, 'b> { + self.token.reset(); RegexTokenStream { regex: self.regex.clone(), text, - token: Token::default(), + token: &mut self.token, cursor: 0, } } } -pub struct RegexTokenStream<'a> { +pub struct RegexTokenStream<'a, 'b> { regex: Regex, text: &'a str, - token: Token, + token: &'b mut Token, cursor: usize, } -impl<'a> TokenStream for RegexTokenStream<'a> { +impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> { fn advance(&mut self) -> bool { let Some(regex_match) = self.regex.find(self.text) else { return false; @@ -147,7 +152,7 @@ mod tests { fn token_stream_helper(text: &str, pattern: &str) -> Vec { let r = RegexTokenizer::new(pattern).unwrap(); - let a = TextAnalyzer::from(r); + let mut a = TextAnalyzer::from(r); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 933e98adb8..d18f57450b 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(RemoveLongFilter::limit(5)) //! .build(); //! @@ -55,9 +55,9 @@ pub struct RemoveLongFilterWrapper { } impl Tokenizer for RemoveLongFilterWrapper { - type TokenStream<'a> = RemoveLongFilterStream>; + type TokenStream<'a, 'b> = RemoveLongFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { RemoveLongFilterStream { token_length_limit: self.length_limit, tail: self.inner.token_stream(text), @@ -103,7 +103,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(6)) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index dc9a3b1267..ff49efa38e 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -3,28 +3,31 @@ use std::str::CharIndices; use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces and punctuation. -#[derive(Clone)] -pub struct SimpleTokenizer; +#[derive(Clone, Default)] +pub struct SimpleTokenizer { + token: Token, +} /// TokenStream produced by the `SimpleTokenizer`. 
-pub struct SimpleTokenStream<'a> { +pub struct SimpleTokenStream<'a, 'b> { text: &'a str, chars: CharIndices<'a>, - token: Token, + token: &'b mut Token, } impl Tokenizer for SimpleTokenizer { - type TokenStream<'a> = SimpleTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> { + type TokenStream<'a, 'b> = SimpleTokenStream<'a, 'b>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> SimpleTokenStream<'a, 'b> { + self.token.reset(); SimpleTokenStream { text, chars: text.char_indices(), - token: Token::default(), + token: &mut self.token, } } } -impl<'a> SimpleTokenStream<'a> { +impl<'a, 'b> SimpleTokenStream<'a, 'b> { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -35,7 +38,7 @@ impl<'a> SimpleTokenStream<'a> { } } -impl<'a> TokenStream for SimpleTokenStream<'a> { +impl<'a, 'b> TokenStream for SimpleTokenStream<'a, 'b> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); @@ -52,11 +55,11 @@ impl<'a> TokenStream for SimpleTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -76,7 +79,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(SimpleTokenizer); + let mut a = TextAnalyzer::from(SimpleTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs index e80e6b31f0..b2366b10b7 100644 --- a/src/tokenizer/split_compound_words.rs +++ b/src/tokenizer/split_compound_words.rs @@ -20,8 +20,8 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// ```rust /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer}; /// -/// let tokenizer = -/// TextAnalyzer::builder(SimpleTokenizer) +/// let mut tokenizer = +/// TextAnalyzer::builder(SimpleTokenizer::default()) /// .filter( /// SplitCompoundWords::from_dictionary([ /// "dampf", "schiff", "fahrt", "brot", "backen", "automat", @@ -29,13 +29,13 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// .unwrap() /// ) /// .build(); -/// -/// let mut stream = tokenizer.token_stream("dampfschifffahrt"); -/// assert_eq!(stream.next().unwrap().text, "dampf"); -/// assert_eq!(stream.next().unwrap().text, "schiff"); -/// assert_eq!(stream.next().unwrap().text, "fahrt"); -/// assert_eq!(stream.next(), None); -/// +/// { +/// let mut stream = tokenizer.token_stream("dampfschifffahrt"); +/// assert_eq!(stream.next().unwrap().text, "dampf"); +/// assert_eq!(stream.next().unwrap().text, "schiff"); +/// assert_eq!(stream.next().unwrap().text, "fahrt"); +/// assert_eq!(stream.next(), None); +/// } /// let mut stream = tokenizer.token_stream("brotbackautomat"); /// assert_eq!(stream.next().unwrap().text, "brotbackautomat"); /// assert_eq!(stream.next(), None); @@ -97,9 +97,9 @@ pub struct SplitCompoundWordsFilter { } impl Tokenizer for SplitCompoundWordsFilter { - type TokenStream<'a> = SplitCompoundWordsTokenStream>; + type TokenStream<'a, 'b> = SplitCompoundWordsTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { SplitCompoundWordsTokenStream { dict: self.dict.clone(), tail: self.inner.token_stream(text), @@ -188,7 +188,7 @@ mod tests { #[test] fn 
splitting_compound_words_works() { - let tokenizer = TextAnalyzer::builder(SimpleTokenizer) + let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap()) .build(); diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 3f8a3eead1..081fd6be4e 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -98,9 +98,9 @@ pub struct StemmerFilter { } impl Tokenizer for StemmerFilter { - type TokenStream<'a> = StemmerTokenStream>; + type TokenStream<'a, 'b> = StemmerTokenStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); StemmerTokenStream { tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs index adfbf17d4a..a1b894130c 100644 --- a/src/tokenizer/stop_word_filter/mod.rs +++ b/src/tokenizer/stop_word_filter/mod.rs @@ -2,7 +2,7 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) //! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])) //! .build(); //! @@ -88,9 +88,9 @@ pub struct StopWordFilterWrapper { } impl Tokenizer for StopWordFilterWrapper { - type TokenStream<'a> = StopWordFilterStream>; + type TokenStream<'a, 'b> = StopWordFilterStream>; - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { StopWordFilterStream { words: self.words.clone(), tail: self.inner.token_stream(text), @@ -151,7 +151,7 @@ mod tests { "am".to_string(), "i".to_string(), ]; - let a = TextAnalyzer::builder(SimpleTokenizer) + let mut a = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(StopWordFilter::remove(stops)) .build(); let mut token_stream = a.token_stream(text); diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 65b7815c81..2c70969899 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -12,13 +12,13 @@ pub struct TextAnalyzer { /// A boxable `Tokenizer`, with its `TokenStream` type erased. trait BoxableTokenizer: 'static + Send + Sync { /// Creates a boxed token stream for a given `str`. - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; + fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a>; /// Clone this tokenizer. fn box_clone(&self) -> Box; } impl BoxableTokenizer for T { - fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { self.token_stream(text).into() } fn box_clone(&self) -> Box { @@ -53,7 +53,7 @@ impl TextAnalyzer { } /// Creates a token stream for a given `str`. 
- pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + pub fn token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { self.tokenizer.box_token_stream(text) } } @@ -71,7 +71,7 @@ impl TextAnalyzerBuilder { /// ```rust /// use tantivy::tokenizer::*; /// - /// let en_stem = TextAnalyzer::builder(SimpleTokenizer) + /// let en_stem = TextAnalyzer::builder(SimpleTokenizer::default()) /// .filter(RemoveLongFilter::limit(40)) /// .filter(LowerCaser) /// .filter(Stemmer::default()) diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index e849471bc5..a2be123903 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -58,23 +58,23 @@ impl Default for TokenizerManager { /// the default pre-configured tokenizers of `tantivy`. fn default() -> TokenizerManager { let manager = TokenizerManager::new(); - manager.register("raw", RawTokenizer); + manager.register("raw", RawTokenizer::default()); manager.register( "default", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .build(), ); manager.register( "en_stem", - TextAnalyzer::builder(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(Stemmer::new(Language::English)) .build(), ); - manager.register("whitespace", WhitespaceTokenizer); + manager.register("whitespace", WhitespaceTokenizer::default()); manager } } diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 6de19ddd7d..8b411ad054 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -3,27 +3,30 @@ use std::str::CharIndices; use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces. -#[derive(Clone)] -pub struct WhitespaceTokenizer; +#[derive(Clone, Default)] +pub struct WhitespaceTokenizer { + token: Token, +} -pub struct WhitespaceTokenStream<'a> { +pub struct WhitespaceTokenStream<'a, 'b> { text: &'a str, chars: CharIndices<'a>, - token: Token, + token: &'b mut Token, } impl Tokenizer for WhitespaceTokenizer { - type TokenStream<'a> = WhitespaceTokenStream<'a>; - fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> { + type TokenStream<'a, 'b> = WhitespaceTokenStream<'a, 'b>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> WhitespaceTokenStream<'a, 'b> { + self.token.reset(); WhitespaceTokenStream { text, chars: text.char_indices(), - token: Token::default(), + token: &mut self.token, } } } -impl<'a> WhitespaceTokenStream<'a> { +impl<'a, 'b> WhitespaceTokenStream<'a, 'b> { // search for the end of the current token. 
fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -34,7 +37,7 @@ impl<'a> WhitespaceTokenStream<'a> { } } -impl<'a> TokenStream for WhitespaceTokenStream<'a> { +impl<'a, 'b> TokenStream for WhitespaceTokenStream<'a, 'b> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); @@ -51,11 +54,11 @@ impl<'a> TokenStream for WhitespaceTokenStream<'a> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } @@ -75,7 +78,7 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(WhitespaceTokenizer); + let mut a = TextAnalyzer::from(WhitespaceTokenizer::default()); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index f43f8b1d6b..ab0fa8c955 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -40,13 +40,24 @@ impl Default for Token { } } +impl Token { + /// reset to default + pub fn reset(&mut self) { + self.offset_from = 0; + self.offset_to = 0; + self.position = usize::MAX; + self.text.clear(); + self.position_length = 1; + } +} + /// `Tokenizer` are in charge of splitting text into a stream of token /// before indexing. pub trait Tokenizer: 'static + Clone + Send + Sync { /// The token stream returned by this Tokenizer. - type TokenStream<'a>: TokenStream; + type TokenStream<'a, 'b>: TokenStream; /// Creates a token stream for a given `str`. - fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>; + fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b>; } /// Simple wrapper of `Box`. 
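The core of this first patch is visible in the reworked `RawTokenizer` above: the tokenizer now owns a `Token`, resets it inside `token_stream`, and lends it mutably to the stream, so no fresh `String::with_capacity(200)` is allocated per call. As a rough, stand-alone sketch of that shape against the trait as it stands after this patch (the names `OneTokenTokenizer` and `OneTokenStream` are hypothetical; `Token`, `Token::reset`, `TokenStream` and `Tokenizer` are the types from the patch, assumed re-exported from `tantivy::tokenizer`):

    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    /// Hypothetical tokenizer emitting its whole input as one token,
    /// mirroring the reworked `RawTokenizer`: the `Token` lives in the
    /// tokenizer and is reused across `token_stream` calls.
    #[derive(Clone, Default)]
    struct OneTokenTokenizer {
        token: Token,
    }

    struct OneTokenStream<'b> {
        token: &'b mut Token,
        has_token: bool,
    }

    impl Tokenizer for OneTokenTokenizer {
        type TokenStream<'a, 'b> = OneTokenStream<'b>;

        fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> OneTokenStream<'b> {
            // `reset()` clears the shared token (keeping the text buffer's
            // capacity), so repeated calls reuse the same allocation.
            self.token.reset();
            self.token.position = 0;
            self.token.offset_from = 0;
            self.token.offset_to = text.len();
            self.token.text.push_str(text);
            OneTokenStream {
                token: &mut self.token,
                has_token: true,
            }
        }
    }

    impl<'b> TokenStream for OneTokenStream<'b> {
        fn advance(&mut self) -> bool {
            let had_token = self.has_token;
            self.has_token = false;
            had_token
        }

        fn token(&self) -> &Token {
            self.token
        }

        fn token_mut(&mut self) -> &mut Token {
            self.token
        }
    }

Because `token_stream` now takes `&mut self`, callers need a mutable binding for the tokenizer (or analyzer), which is what the updated benches, examples and tests above switch to.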
From 90b170b6b8da79c6e5262ab859d791ec42bbef05 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 2 Jun 2023 12:29:11 +0800 Subject: [PATCH 2/4] simplify api --- Cargo.toml | 2 +- src/tokenizer/alphanum_only.rs | 4 ++-- src/tokenizer/ascii_folding_filter.rs | 4 ++-- src/tokenizer/empty_tokenizer.rs | 2 +- src/tokenizer/facet_tokenizer.rs | 10 +++++----- src/tokenizer/lower_caser.rs | 4 ++-- src/tokenizer/ngram_tokenizer.rs | 10 +++++----- src/tokenizer/raw_tokenizer.rs | 2 +- src/tokenizer/regex_tokenizer.rs | 14 +++++++------- src/tokenizer/remove_long.rs | 4 ++-- src/tokenizer/simple_tokenizer.rs | 12 ++++++------ src/tokenizer/split_compound_words.rs | 4 ++-- src/tokenizer/stemmer.rs | 4 ++-- src/tokenizer/stop_word_filter/mod.rs | 4 ++-- src/tokenizer/tokenizer.rs | 6 +++--- src/tokenizer/whitespace_tokenizer.rs | 12 ++++++------ tokenizer-api/src/lib.rs | 4 ++-- 17 files changed, 51 insertions(+), 51 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0be3f600e4..2e985b2c1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,7 +77,7 @@ proptest = "1.0.0" criterion = "0.5" test-log = "0.2.10" env_logger = "0.10.0" -pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] } +pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5 futures = "0.3.21" paste = "1.0.11" more-asserts = "0.3.1" diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index abca2f671a..b40731fd34 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -50,9 +50,9 @@ impl TokenFilter for AlphaNumOnlyFilter { pub struct AlphaNumOnlyFilterWrapper(T); impl Tokenizer for AlphaNumOnlyFilterWrapper { - type TokenStream<'a, 'b> = AlphaNumOnlyFilterStream>; + type TokenStream<'a> = AlphaNumOnlyFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { AlphaNumOnlyFilterStream { tail: self.0.token_stream(text), } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 65fa51df1f..4b3655febf 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -20,9 +20,9 @@ impl TokenFilter for AsciiFoldingFilter { pub struct AsciiFoldingFilterWrapper(T); impl Tokenizer for AsciiFoldingFilterWrapper { - type TokenStream<'a, 'b> = AsciiFoldingFilterTokenStream>; + type TokenStream<'a> = AsciiFoldingFilterTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { AsciiFoldingFilterTokenStream { buffer: String::with_capacity(100), tail: self.0.token_stream(text), diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs index deee002070..46cb78c10d 100644 --- a/src/tokenizer/empty_tokenizer.rs +++ b/src/tokenizer/empty_tokenizer.rs @@ -4,7 +4,7 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer}; pub(crate) struct EmptyTokenizer; impl Tokenizer for EmptyTokenizer { - type TokenStream<'a, 'b> = EmptyTokenStream; + type TokenStream<'a> = EmptyTokenStream; fn token_stream(&mut self, _text: &str) -> EmptyTokenStream { EmptyTokenStream::default() } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 01bd659eea..568d60ae31 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ 
b/src/tokenizer/facet_tokenizer.rs @@ -21,15 +21,15 @@ enum State { Terminated, } -pub struct FacetTokenStream<'a, 'b> { +pub struct FacetTokenStream<'a> { text: &'a str, state: State, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for FacetTokenizer { - type TokenStream<'a, 'b> = FacetTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> FacetTokenStream<'a, 'b> { + type TokenStream<'a> = FacetTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> { self.token.reset(); self.token.position = 0; FacetTokenStream { @@ -40,7 +40,7 @@ impl Tokenizer for FacetTokenizer { } } -impl<'a, 'b> TokenStream for FacetTokenStream<'a, 'b> { +impl<'a> TokenStream for FacetTokenStream<'a> { fn advance(&mut self) -> bool { match self.state { State::RootFacetNotEmitted => { diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index a0efe68b4b..18696b6b75 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -18,9 +18,9 @@ impl TokenFilter for LowerCaser { pub struct LowerCaserFilter(T); impl Tokenizer for LowerCaserFilter { - type TokenStream<'a, 'b> = LowerCaserTokenStream>; + type TokenStream<'a> = LowerCaserTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { LowerCaserTokenStream { tail: self.0.token_stream(text), buffer: String::new(), // TODO move to global buffer diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index e3e4cd59a7..ae54cacf4f 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -121,7 +121,7 @@ impl NgramTokenizer { } /// TokenStream associate to the `NgramTokenizer` -pub struct NgramTokenStream<'a, 'b> { +pub struct NgramTokenStream<'a> { /// parameters ngram_charidx_iterator: StutteringIterator>, /// true if the NgramTokenStream is in prefix mode. 
@@ -129,12 +129,12 @@ pub struct NgramTokenStream<'a, 'b> { /// input text: &'a str, /// output - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for NgramTokenizer { - type TokenStream<'a, 'b> = NgramTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> NgramTokenStream<'a, 'b> { + type TokenStream<'a> = NgramTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> { self.token.reset(); NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( @@ -149,7 +149,7 @@ impl Tokenizer for NgramTokenizer { } } -impl<'a, 'b> TokenStream for NgramTokenStream<'a, 'b> { +impl<'a> TokenStream for NgramTokenStream<'a> { fn advance(&mut self) -> bool { if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() { if self.prefix_only && offset_from > 0 { diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 07079abf20..9bf7ee22a9 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -12,7 +12,7 @@ pub struct RawTokenStream<'a> { } impl Tokenizer for RawTokenizer { - type TokenStream<'b, 'a> = RawTokenStream<'a>; + type TokenStream<'a> = RawTokenStream<'a>; fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> { self.token.reset(); self.token.position = 0; diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index cd502a55d3..f9a10ad201 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -64,8 +64,8 @@ impl RegexTokenizer { } impl Tokenizer for RegexTokenizer { - type TokenStream<'a, 'b> = RegexTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> RegexTokenStream<'a, 'b> { + type TokenStream<'a> = RegexTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> { self.token.reset(); RegexTokenStream { regex: self.regex.clone(), @@ -76,14 +76,14 @@ impl Tokenizer for RegexTokenizer { } } -pub struct RegexTokenStream<'a, 'b> { +pub struct RegexTokenStream<'a> { regex: Regex, text: &'a str, - token: &'b mut Token, + token: &'a mut Token, cursor: usize, } -impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> { +impl<'a> TokenStream for RegexTokenStream<'a> { fn advance(&mut self) -> bool { let Some(regex_match) = self.regex.find(self.text) else { return false; @@ -105,11 +105,11 @@ impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> { } fn token(&self) -> &Token { - &self.token + self.token } fn token_mut(&mut self) -> &mut Token { - &mut self.token + self.token } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index d18f57450b..78f3e731ac 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -55,9 +55,9 @@ pub struct RemoveLongFilterWrapper { } impl Tokenizer for RemoveLongFilterWrapper { - type TokenStream<'a, 'b> = RemoveLongFilterStream>; + type TokenStream<'a> = RemoveLongFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { RemoveLongFilterStream { token_length_limit: self.length_limit, tail: self.inner.token_stream(text), diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index ff49efa38e..540dfac477 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -9,15 +9,15 @@ pub struct SimpleTokenizer { } /// TokenStream produced by the `SimpleTokenizer`. 
-pub struct SimpleTokenStream<'a, 'b> { +pub struct SimpleTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for SimpleTokenizer { - type TokenStream<'a, 'b> = SimpleTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> SimpleTokenStream<'a, 'b> { + type TokenStream<'a> = SimpleTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> { self.token.reset(); SimpleTokenStream { text, @@ -27,7 +27,7 @@ impl Tokenizer for SimpleTokenizer { } } -impl<'a, 'b> SimpleTokenStream<'a, 'b> { +impl<'a> SimpleTokenStream<'a> { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -38,7 +38,7 @@ impl<'a, 'b> SimpleTokenStream<'a, 'b> { } } -impl<'a, 'b> TokenStream for SimpleTokenStream<'a, 'b> { +impl<'a> TokenStream for SimpleTokenStream<'a> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs index b2366b10b7..bcde161cc8 100644 --- a/src/tokenizer/split_compound_words.rs +++ b/src/tokenizer/split_compound_words.rs @@ -97,9 +97,9 @@ pub struct SplitCompoundWordsFilter { } impl Tokenizer for SplitCompoundWordsFilter { - type TokenStream<'a, 'b> = SplitCompoundWordsTokenStream>; + type TokenStream<'a> = SplitCompoundWordsTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { SplitCompoundWordsTokenStream { dict: self.dict.clone(), tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 081fd6be4e..4c43b609ab 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -98,9 +98,9 @@ pub struct StemmerFilter { } impl Tokenizer for StemmerFilter { - type TokenStream<'a, 'b> = StemmerTokenStream>; + type TokenStream<'a> = StemmerTokenStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); StemmerTokenStream { tail: self.inner.token_stream(text), diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs index a1b894130c..80f1fe4938 100644 --- a/src/tokenizer/stop_word_filter/mod.rs +++ b/src/tokenizer/stop_word_filter/mod.rs @@ -88,9 +88,9 @@ pub struct StopWordFilterWrapper { } impl Tokenizer for StopWordFilterWrapper { - type TokenStream<'a, 'b> = StopWordFilterStream>; + type TokenStream<'a> = StopWordFilterStream>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> { + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { StopWordFilterStream { words: self.words.clone(), tail: self.inner.token_stream(text), diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 2c70969899..ccab6cda73 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -12,13 +12,13 @@ pub struct TextAnalyzer { /// A boxable `Tokenizer`, with its `TokenStream` type erased. trait BoxableTokenizer: 'static + Send + Sync { /// Creates a boxed token stream for a given `str`. 
- fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a>; + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>; /// Clone this tokenizer. fn box_clone(&self) -> Box; } impl BoxableTokenizer for T { - fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { + fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.token_stream(text).into() } fn box_clone(&self) -> Box { @@ -53,7 +53,7 @@ impl TextAnalyzer { } /// Creates a token stream for a given `str`. - pub fn token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> { + pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> { self.tokenizer.box_token_stream(text) } } diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 8b411ad054..69a3b05509 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -8,15 +8,15 @@ pub struct WhitespaceTokenizer { token: Token, } -pub struct WhitespaceTokenStream<'a, 'b> { +pub struct WhitespaceTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: &'b mut Token, + token: &'a mut Token, } impl Tokenizer for WhitespaceTokenizer { - type TokenStream<'a, 'b> = WhitespaceTokenStream<'a, 'b>; - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> WhitespaceTokenStream<'a, 'b> { + type TokenStream<'a> = WhitespaceTokenStream<'a>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> { self.token.reset(); WhitespaceTokenStream { text, @@ -26,7 +26,7 @@ impl Tokenizer for WhitespaceTokenizer { } } -impl<'a, 'b> WhitespaceTokenStream<'a, 'b> { +impl<'a> WhitespaceTokenStream<'a> { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) @@ -37,7 +37,7 @@ impl<'a, 'b> WhitespaceTokenStream<'a, 'b> { } } -impl<'a, 'b> TokenStream for WhitespaceTokenStream<'a, 'b> { +impl<'a> TokenStream for WhitespaceTokenStream<'a> { fn advance(&mut self) -> bool { self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index ab0fa8c955..e563443715 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -55,9 +55,9 @@ impl Token { /// before indexing. pub trait Tokenizer: 'static + Clone + Send + Sync { /// The token stream returned by this Tokenizer. - type TokenStream<'a, 'b>: TokenStream; + type TokenStream<'a>: TokenStream; /// Creates a token stream for a given `str`. - fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b>; + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>; } /// Simple wrapper of `Box`. 
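Note (illustration, not part of the patch series): with the single-lifetime `type TokenStream<'a>` and `fn token_stream<'a>(&'a mut self, text: &'a str)`, a downstream tokenizer keeps its `Token` as a field, resets it in `token_stream`, and hands the stream a `&'a mut Token` instead of allocating per call. The `CommaTokenizer` below is a hypothetical sketch of that pattern, assuming the `Token`, `TokenStream`, and `Tokenizer` re-exports in `tantivy::tokenizer`; it mirrors what the patch does to `SimpleTokenizer` and `WhitespaceTokenizer` rather than adding anything new.

    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    /// Hypothetical example tokenizer (not part of this patch): splits on ','
    /// and reuses a single `Token` owned by the tokenizer.
    #[derive(Clone, Default)]
    pub struct CommaTokenizer {
        token: Token,
    }

    pub struct CommaTokenStream<'a> {
        text: &'a str,
        cursor: usize,
        token: &'a mut Token,
    }

    impl Tokenizer for CommaTokenizer {
        type TokenStream<'a> = CommaTokenStream<'a>;

        fn token_stream<'a>(&'a mut self, text: &'a str) -> CommaTokenStream<'a> {
            // Reset the shared token instead of allocating a fresh one per call.
            self.token.reset();
            CommaTokenStream {
                text,
                cursor: 0,
                token: &mut self.token,
            }
        }
    }

    impl<'a> TokenStream for CommaTokenStream<'a> {
        fn advance(&mut self) -> bool {
            // Skip separators, then emit the next comma-delimited chunk.
            while self.text[self.cursor..].starts_with(',') {
                self.cursor += 1;
            }
            if self.cursor >= self.text.len() {
                return false;
            }
            let start = self.cursor;
            let end = self.text[start..]
                .find(',')
                .map_or(self.text.len(), |pos| start + pos);
            self.cursor = end;
            self.token.position = self.token.position.wrapping_add(1);
            self.token.offset_from = start;
            self.token.offset_to = end;
            self.token.text.clear();
            self.token.text.push_str(&self.text[start..end]);
            true
        }

        fn token(&self) -> &Token {
            self.token
        }

        fn token_mut(&mut self) -> &mut Token {
            self.token
        }
    }

Because `token_stream` now borrows the tokenizer mutably, only one stream per tokenizer can be alive at a time; that is the price of sharing the `Token` (and, in the following patches, the filter buffers) across calls.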
From fa6e121e24e7424af80d6562d28931fd0a166cc9 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 8 Jun 2023 16:56:56 +0800 Subject: [PATCH 3/4] move lowercase and ascii folding buffer to global --- src/tokenizer/ascii_folding_filter.rs | 27 +++++++++++++++++---------- src/tokenizer/lower_caser.rs | 27 +++++++++++++++++---------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 4b3655febf..da8039e175 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -12,38 +12,45 @@ impl TokenFilter for AsciiFoldingFilter { type Tokenizer = AsciiFoldingFilterWrapper; fn transform(self, tokenizer: T) -> AsciiFoldingFilterWrapper { - AsciiFoldingFilterWrapper(tokenizer) + AsciiFoldingFilterWrapper { + tokenizer, + buffer: String::new(), + } } } #[derive(Clone)] -pub struct AsciiFoldingFilterWrapper(T); +pub struct AsciiFoldingFilterWrapper { + tokenizer: T, + buffer: String, +} impl Tokenizer for AsciiFoldingFilterWrapper { - type TokenStream<'a> = AsciiFoldingFilterTokenStream>; + type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>; fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + self.buffer.clear(); AsciiFoldingFilterTokenStream { - buffer: String::with_capacity(100), - tail: self.0.token_stream(text), + buffer: &mut self.buffer, + tail: self.tokenizer.token_stream(text), } } } -pub struct AsciiFoldingFilterTokenStream { - buffer: String, +pub struct AsciiFoldingFilterTokenStream<'a, T> { + buffer: &'a mut String, tail: T, } -impl TokenStream for AsciiFoldingFilterTokenStream { +impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; } if !self.token_mut().text.is_ascii() { // ignore its already ascii - to_ascii(&self.tail.token().text, &mut self.buffer); - mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + to_ascii(&self.tail.token().text, self.buffer); + mem::swap(&mut self.tail.token_mut().text, self.buffer); } true } diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 18696b6b75..56792ba827 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -10,26 +10,33 @@ impl TokenFilter for LowerCaser { type Tokenizer = LowerCaserFilter; fn transform(self, tokenizer: T) -> Self::Tokenizer { - LowerCaserFilter(tokenizer) + LowerCaserFilter { + tokenizer, + buffer: String::new(), + } } } #[derive(Clone)] -pub struct LowerCaserFilter(T); +pub struct LowerCaserFilter { + tokenizer: T, + buffer: String, +} impl Tokenizer for LowerCaserFilter { - type TokenStream<'a> = LowerCaserTokenStream>; + type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>; fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + self.buffer.clear(); LowerCaserTokenStream { - tail: self.0.token_stream(text), - buffer: String::new(), // TODO move to global buffer + tail: self.tokenizer.token_stream(text), + buffer: &mut self.buffer, } } } -pub struct LowerCaserTokenStream { - buffer: String, +pub struct LowerCaserTokenStream<'a, T> { + buffer: &'a mut String, tail: T, } @@ -44,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) { } } -impl TokenStream for LowerCaserTokenStream { +impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -53,8 
+60,8 @@ impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
             // fast track for ascii.
             self.token_mut().text.make_ascii_lowercase();
         } else {
-            to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_lowercase_unicode(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }

From 2e44397babcaa6ae86279ad4321ece6d51a0af1a Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Thu, 8 Jun 2023 17:16:37 +0800
Subject: [PATCH 4/4] empty Token text as default

---
 tokenizer-api/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index e563443715..adb37a0b4f 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -34,7 +34,7 @@ impl Default for Token {
             offset_from: 0,
             offset_to: 0,
             position: usize::MAX,
-            text: String::with_capacity(200),
+            text: String::new(),
             position_length: 1,
         }
     }
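Note (illustration, not part of the patch series): the 200-byte preallocation made sense when every `token_stream` call built a fresh `Token`; now a single `Token` lives inside the tokenizer and is merely `reset()` between calls, so its `text` buffer is grown once and reused. A small usage sketch, assuming `SimpleTokenizer` implements `Default` and the re-exports in `tantivy::tokenizer`:

    use tantivy::tokenizer::{SimpleTokenizer, TokenStream, Tokenizer};

    fn count_tokens(docs: &[&str]) -> usize {
        // One tokenizer, one shared `Token`: its text buffer is grown by the
        // first documents and then reused, so `Token::default()` can start empty.
        let mut tokenizer = SimpleTokenizer::default();
        let mut count = 0;
        for &doc in docs {
            let mut stream = tokenizer.token_stream(doc);
            while stream.advance() {
                count += 1;
            }
        }
        count
    }

    fn main() {
        assert_eq!(count_tokens(&["hello world", "buffers are reused here"]), 6);
    }

Only the first few tokens pay for growing the buffer; afterwards `text.clear()` keeps the capacity, so starting from `String::new()` costs nothing on the steady-state path.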