diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index dc53593..47c04d6 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -335,15 +335,12 @@ impl Group { }) } - pub fn text<'a>(&self, text: &'a str) -> &'a str { + pub fn text<'a>(&self, sentence: &'a MatchSentence<'a>) -> &'a str { if self.span.char().start >= self.span.char().end { return ""; } - let mut char_indices: Vec<_> = text.char_indices().map(|(i, _)| i).collect(); - char_indices.push(text.len()); - - &text[char_indices[self.span.char().start]..char_indices[self.span.char().end]] + sentence.slice(self.span.clone()) } } @@ -393,6 +390,11 @@ impl<'t> MatchSentence<'t> { self.sentence.text() } + pub fn slice(&self, span: Span) -> &str { + let span = span.lshift(self.span().start()); + &self.text()[span.byte().clone()] + } + pub fn tagger(&self) -> &'t Tagger { self.sentence.tagger() } diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs index f7adb03..22ec069 100644 --- a/nlprule/src/rule/engine/mod.rs +++ b/nlprule/src/rule/engine/mod.rs @@ -125,7 +125,10 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { .get(&byte_span.end) .expect("byte index is at char boundary"); - groups.push(Group::new(Span::new(byte_span, char_start..char_end))); + groups.push(Group::new( + Span::new(byte_span, char_start..char_end) + .rshift(sentence.span().start()), + )); } else { groups.push(Group::new(Span::default())); } diff --git a/nlprule/src/rule/grammar.rs b/nlprule/src/rule/grammar.rs index c8a9c1f..e4a9cb3 100644 --- a/nlprule/src/rule/grammar.rs +++ b/nlprule/src/rule/grammar.rs @@ -93,7 +93,7 @@ pub struct Match { impl Match { fn apply(&self, sentence: &MatchSentence, graph: &MatchGraph) -> Option { - let text = graph.by_id(self.id).text(sentence.text()); + let text = graph.by_id(self.id).text(sentence); let mut text = if let Some(replacer) = &self.pos_replacer { replacer.apply(text, sentence)? 
@@ -170,7 +170,7 @@ impl Synthesizer { .chars() .next() // a word is expected to always have at least one char, but be defensive here .map_or(false, char::is_uppercase)) - || first_token.span().byte().start == 0 + || first_token.span().start() == sentence.span().start() }) .unwrap_or(false); diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index 0aea1bd..b684df1 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -9,8 +9,8 @@ use crate::{ use itertools::Itertools; use log::{error, info, warn}; use serde::{Deserialize, Serialize}; +use std::collections::HashSet; use std::fmt; -use std::{collections::HashSet, ops::Range}; pub(crate) mod disambiguation; pub(crate) mod engine; @@ -94,7 +94,30 @@ pub struct DisambiguationRule { } #[derive(Default)] -pub(crate) struct Changes(Vec>>>); +pub(crate) struct Changes(Vec>>); + +// This is only used in tests at the moment. +// Could maybe be made generic. +impl Changes { + fn lshift(self, position: Position) -> Self { + Changes( + self.0 + .into_iter() + .map(|spans| { + spans + .into_iter() + .map(|group_spans| { + group_spans + .into_iter() + .map(|span| span.lshift(position)) + .collect() + }) + .collect() + }) + .collect(), + ) + } +} impl Changes { pub fn is_empty(&self) -> bool { @@ -113,7 +136,7 @@ impl DisambiguationRule { return Changes::default(); } - let mut all_byte_spans = Vec::new(); + let mut all_spans = Vec::new(); for graph in self.engine.get_matches(sentence, self.start, self.end) { if let Some(unification) = &self.unification { @@ -128,39 +151,34 @@ impl DisambiguationRule { } } - let mut byte_spans = Vec::new(); + let mut spans = Vec::new(); for group_idx in GraphId::range(&self.start, &self.end) { let group = graph.by_id(group_idx); - let group_byte_spans: HashSet<_> = group - .tokens(sentence) - .map(|x| x.span().byte().clone()) - .collect(); + let group_spans: HashSet<_> = + group.tokens(sentence).map(|x| x.span().clone()).collect(); - byte_spans.push(group_byte_spans); + 
spans.push(group_spans); } - all_byte_spans.push(byte_spans); + all_spans.push(spans); } - Changes(all_byte_spans) + Changes(all_spans) } pub(crate) fn change<'t>(&'t self, sentence: &mut IncompleteSentence<'t>, changes: Changes) { log::info!("applying {}", self.id); - for byte_spans in changes.0 { + for spans in changes.0 { let mut groups = Vec::new(); let mut refs = sentence.iter_mut().collect::>(); - for group_byte_spans in byte_spans { + for group_spans in spans { let mut group = Vec::new(); - while let Some(i) = refs - .iter() - .position(|x| group_byte_spans.contains(&x.span().byte())) - { + while let Some(i) = refs.iter().position(|x| group_spans.contains(&x.span())) { group.push(refs.remove(i)); } @@ -189,8 +207,15 @@ impl DisambiguationRule { .expect("test text must not be empty"), Some(&self.id), ); - let sentence_before_complete = sentence_before.clone().into_sentence(); - let changes = self.apply(&MatchSentence::new(&sentence_before_complete)); + + // shift the sentence to the right before matching to make sure + // nothing assumes the sentence starts from absolute index zero + let shift_delta = Position { byte: 1, char: 1 }; + let sentence_before_complete = + sentence_before.clone().rshift(shift_delta).into_sentence(); + let changes = self + .apply(&MatchSentence::new(&sentence_before_complete)) + .lshift(shift_delta); let mut sentence_after = sentence_before.clone(); @@ -319,10 +344,7 @@ impl<'a, 't> Iterator for Suggestions<'a, 't> { return None; } - let text_before = &sentence.text()[Span::from_positions(start, end) - .lshift(sentence.span().start()) - .byte() - .clone()]; + let text_before = sentence.slice(Span::from_positions(start, end)); // fix e. g. 
"Super , dass" let replacements: Vec = replacements @@ -450,6 +472,11 @@ impl Rule { pub fn test(&self, tokenizer: &Tokenizer) -> bool { let mut passes = Vec::new(); + // make sure relative position is handled correctly + // shifting the entire sentence must be a no-op as far as the matcher is concerned + // if the suggestions are shifted back + let shift_delta = Position { byte: 1, char: 1 }; + for test in self.examples.iter() { // by convention examples are always considered as one sentence even if the sentencizer would split let sentence = tokenizer @@ -458,9 +485,14 @@ impl Rule { .tokenize(&test.text()) .expect("test text must not be empty."), ) + .rshift(shift_delta) .into_sentence(); + info!("Sentence: {:#?}", sentence); - let suggestions: Vec<_> = self.apply(&MatchSentence::new(&sentence)).collect(); + let suggestions: Vec<_> = self + .apply(&MatchSentence::new(&sentence)) + .map(|s| s.lshift(shift_delta)) + .collect(); let pass = if suggestions.len() > 1 { false diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index 775ddb0..cb2d5fa 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -721,4 +721,10 @@ impl Suggestion { self.span = self.span.rshift(position); self } + + /// Shift the span left by the specified amount. + pub fn lshift(mut self, position: Position) -> Self { + self.span = self.span.lshift(position); + self + } }