Skip to content

Commit

Permalink
Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery …
Browse files Browse the repository at this point in the history
…instances lazy.
  • Loading branch information
adamreichold committed Jan 4, 2023
1 parent f97eee8 commit b288f2b
Showing 1 changed file with 34 additions and 43 deletions.
77 changes: 34 additions & 43 deletions src/query/fuzzy_query.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
use std::collections::HashMap;
use std::ops::Range;

use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::Lazy;
use once_cell::sync::OnceCell;
use tantivy_fst::Automaton;

use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
Expand Down Expand Up @@ -34,22 +31,6 @@ impl Automaton for DfaWrapper {
}
}

/// The range of Levenshtein distances for which DFA builders are constructed
/// for our terms.
///
/// The computation is exponential, so it is best to keep the upper bound to
/// low single digits.
const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = 0..3;

/// Cache of `LevenshteinAutomatonBuilder`s keyed by `(distance, transposition)`.
///
/// The map is filled inside the `Lazy` initializer, so the first access builds
/// a builder for every distance in `VALID_LEVENSHTEIN_DISTANCE_RANGE` combined
/// with both transposition settings, whether or not each one is ever used.
static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
let mut lev_builder_cache = HashMap::new();
// TODO make population lazy on a `(distance, val)` basis
for distance in VALID_LEVENSHTEIN_DISTANCE_RANGE {
// Build one entry per transposition setting for this distance.
for &transposition in &[false, true] {
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
}
}
lev_builder_cache
});

/// A Fuzzy Query matches all of the documents
/// containing a specific term that is within
/// Levenshtein distance
Expand Down Expand Up @@ -129,30 +110,40 @@ impl FuzzyTermQuery {
}

/// Builds the `AutomatonWeight` for this query from a Levenshtein DFA over
/// the term's text.
///
/// Returns an `InvalidArgument` error when `self.distance` is not supported
/// or when the term is not a string.
// NOTE(review): this span appears to show both sides of the diff. The `match`
// on `LEV_BUILDER` looks like the removed implementation and the
// `AUTOMATON_BUILDER` code after it the added one; confirm against the
// repository before editing.
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) {
// Unwrap the option and build the Ok(AutomatonWeight)
Some(automaton_builder) => {
let term_text = self.term.as_str().ok_or_else(|| {
crate::TantivyError::InvalidArgument(
"The fuzzy term query requires a string term.".to_string(),
)
})?;
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
// No builder is cached for this `(distance, transposition)` key.
None => Err(InvalidArgument(format!(
"Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
self.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
))),
// Reject distances that have no row in `AUTOMATON_BUILDER`, so the
// indexing further down cannot go out of bounds on the outer array.
if self.distance as usize >= AUTOMATON_BUILDER.len() {
return Err(InvalidArgument(format!(
"Levenshtein distance of {} is not allowed. Choose a value less than {}",
self.distance,
AUTOMATON_BUILDER.len()
)));
}

// One cell per `[distance][transposition_cost_one]` combination. Each cell
// starts empty and is filled by `get_or_init` the first time that
// combination is requested.
static AUTOMATON_BUILDER: [[OnceCell<LevenshteinAutomatonBuilder>; 2]; 3] = [
[OnceCell::new(), OnceCell::new()],
[OnceCell::new(), OnceCell::new()],
[OnceCell::new(), OnceCell::new()],
];

// The outer index is the distance; the inner index is
// `transposition_cost_one` cast to `usize`.
let automaton_builder = AUTOMATON_BUILDER[self.distance as usize]
[self.transposition_cost_one as usize]
.get_or_init(|| {
LevenshteinAutomatonBuilder::new(self.distance, self.transposition_cost_one)
});

// The term must expose a string form; otherwise the query is rejected.
let term_text = self.term.as_str().ok_or_else(|| {
crate::TantivyError::InvalidArgument(
"The fuzzy term query requires a string term.".to_string(),
)
})?;
// `self.prefix` selects `build_prefix_dfa` over `build_dfa`.
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
}

Expand Down

0 comments on commit b288f2b

Please sign in to comment.