Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tokenizer-api: reduce Tokenizer overhead #2062

Merged
merged 4 commits into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ proptest = "1.0.0"
criterion = "0.5"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
Expand Down
2 changes: 1 addition & 1 deletion benches/analyzer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ const ALICE_TXT: &str = include_str!("alice.txt");

pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let tokenizer = tokenizer_manager.get("default").unwrap();
let mut tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
Expand Down
3 changes: 2 additions & 1 deletion examples/pre_tokenized_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokenizer = SimpleTokenizer::default();
let mut token_stream = tokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
Expand Down
2 changes: 1 addition & 1 deletion examples/stop_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {

// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
Expand Down
8 changes: 4 additions & 4 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ impl IndexingPositionsPerPath {
pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
Expand All @@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>(
fn index_json_object(
doc: DocId,
json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
Expand All @@ -117,7 +117,7 @@ fn index_json_object(
fn index_json_value(
doc: DocId,
json_value: &serde_json::Value,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
Expand Down Expand Up @@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
pub(crate) fn set_string_and_get_terms(
json_term_writer: &mut JsonTermWriter,
value: &str,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
Expand Down
2 changes: 1 addition & 1 deletion src/fastfield/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ mod tests {
let ff_tokenizer_manager = TokenizerManager::default();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer)
TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.build(),
);
Expand Down
12 changes: 6 additions & 6 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ impl FastFieldsWriter {
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize]
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
Expand Down Expand Up @@ -202,7 +202,7 @@ impl FastFieldsWriter {
self.json_path_buffer.push_str(field_name);

let text_analyzer =
&self.per_field_tokenizer[field_value.field().field_id() as usize];
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];

record_json_obj_to_columnar_writer(
doc_id,
Expand Down Expand Up @@ -263,7 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
Expand Down Expand Up @@ -302,7 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
tokenizer: &mut Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
Expand All @@ -321,7 +321,7 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
if let Some(text_analyzer) = tokenizer {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
Expand Down Expand Up @@ -379,7 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&None,
&mut None,
);
}
let mut buffer = Vec::new();
Expand Down
8 changes: 5 additions & 3 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,11 @@ impl SegmentWriter {

match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // NOTE: could be created once and shared (e.g. a global) instead of being rebuilt each time this branch runs
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
Expand All @@ -208,7 +209,7 @@ impl SegmentWriter {
}
Value::Str(ref text) => {
let text_analyzer =
&self.per_field_text_analyzers[field.field_id() as usize];
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
Expand Down Expand Up @@ -304,7 +305,8 @@ impl SegmentWriter {
}
}
FieldType::JsonObject(json_options) => {
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
Expand Down
4 changes: 2 additions & 2 deletions src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
.register("simple_no_truncation", SimpleTokenizer::default());
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;

Expand Down Expand Up @@ -194,7 +194,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
.register("simple_no_truncation", SimpleTokenizer::default());
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;

Expand Down
44 changes: 24 additions & 20 deletions src/query/more_like_this/more_like_this.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,45 +192,49 @@ impl MoreLikeThis {
})
.collect::<Result<Vec<_>>>()?;
for fake_str in facets {
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
if self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
FacetTokenizer::default()
.token_stream(fake_str)
.process(&mut |token| {
if self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
FieldType::Str(text_options) => {
let mut token_streams: Vec<BoxTokenStream> = vec![];

for value in values {
match value {
Value::PreTokStr(tok_str) => {
token_streams.push(PreTokenizedStream::from(tok_str.clone()).into());
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
Value::Str(ref text) => {
if let Some(tokenizer) = text_options
if let Some(mut tokenizer) = text_options
.get_indexing_options()
.map(|text_indexing_options| {
text_indexing_options.tokenizer().to_string()
})
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
{
token_streams.push(tokenizer.token_stream(text));
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
_ => (),
}
}

for mut token_stream in token_streams {
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
FieldType::U64(_) => {
for value in values {
Expand Down
16 changes: 8 additions & 8 deletions src/query/query_parser/query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ impl QueryParser {
// A field that is not indexed should really have been caught earlier.
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
let text_analyzer =
let mut text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
Expand Down Expand Up @@ -497,7 +497,7 @@ impl QueryParser {
// A field that is not indexed should really have been caught earlier.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = self
let mut text_analyzer = self
.tokenizer_manager
.get(indexing_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
Expand All @@ -511,7 +511,7 @@ impl QueryParser {
slop,
prefix,
indexing_options,
&text_analyzer,
&mut text_analyzer,
)?
.into_iter()
.collect())
Expand Down Expand Up @@ -795,7 +795,7 @@ fn generate_literals_for_str(
slop: u32,
prefix: bool,
indexing_options: &TextFieldIndexing,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let mut terms: Vec<(usize, Term)> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
Expand Down Expand Up @@ -840,7 +840,7 @@ fn generate_literals_for_json_object(
// A field that is not indexed should really have been caught earlier.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
let mut text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
Expand All @@ -858,7 +858,7 @@ fn generate_literals_for_json_object(
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {
Expand Down Expand Up @@ -959,7 +959,7 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::builder(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
Expand Down Expand Up @@ -1463,7 +1463,7 @@ mod test {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("customtokenizer", SimpleTokenizer);
.register("customtokenizer", SimpleTokenizer::default());
let query_parser = QueryParser::for_index(&index, vec![title]);
assert_eq!(
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
Expand Down
Loading