fix bug with matching logic assuming sentence starts from index 0 #63

Merged · 1 commit · Apr 8, 2021

12 changes: 7 additions & 5 deletions nlprule/src/rule/engine/composition.rs
@@ -335,15 +335,12 @@ impl Group {
         })
     }
 
-    pub fn text<'a>(&self, text: &'a str) -> &'a str {
+    pub fn text<'a>(&self, sentence: &'a MatchSentence<'a>) -> &'a str {
         if self.span.char().start >= self.span.char().end {
             return "";
         }
 
-        let mut char_indices: Vec<_> = text.char_indices().map(|(i, _)| i).collect();
-        char_indices.push(text.len());
-
-        &text[char_indices[self.span.char().start]..char_indices[self.span.char().end]]
+        sentence.slice(self.span.clone())
     }
 }

@@ -393,6 +390,11 @@ impl<'t> MatchSentence<'t> {
         self.sentence.text()
     }
 
+    pub fn slice(&self, span: Span) -> &str {
+        let span = span.lshift(self.span().start());
+        &self.text()[span.byte().clone()]
+    }
+
     pub fn tagger(&self) -> &'t Tagger {
         self.sentence.tagger()
     }
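The core of the fix: `Group::text` no longer re-derives char indices from a raw string slice; it delegates to `MatchSentence::slice`, which lshifts the absolute span by the sentence's start position before indexing into the sentence text. A minimal sketch of that translation, using simplified stand-ins for `Position`, `Span`, and the sentence type (not the crate's real definitions, which also track char offsets):

```rust
// Simplified stand-ins to illustrate the absolute -> sentence-relative
// translation performed by MatchSentence::slice.
#[derive(Clone, Copy)]
struct Position {
    byte: usize,
}

#[derive(Clone, Copy)]
struct Span {
    start: Position,
    end: Position,
}

impl Span {
    // Shift the span left, e.g. from document-absolute to sentence-relative offsets.
    fn lshift(self, delta: Position) -> Span {
        Span {
            start: Position { byte: self.start.byte - delta.byte },
            end: Position { byte: self.end.byte - delta.byte },
        }
    }
}

struct SentenceView<'t> {
    text: &'t str,
    // absolute position at which this sentence starts in the document
    start: Position,
}

impl<'t> SentenceView<'t> {
    // Analogous to MatchSentence::slice: make the span relative, then index.
    fn slice(&self, span: Span) -> &str {
        let span = span.lshift(self.start);
        &self.text[span.start.byte..span.end.byte]
    }
}

fn main() {
    // The second sentence of "Hi. Hello world." starts at absolute byte 4.
    let sentence = SentenceView {
        text: "Hello world.",
        start: Position { byte: 4 },
    };
    // Absolute span of "world" in the full document: bytes 10..15.
    let span = Span {
        start: Position { byte: 10 },
        end: Position { byte: 15 },
    };
    assert_eq!(sentence.slice(span), "world");
}
```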
5 changes: 4 additions & 1 deletion nlprule/src/rule/engine/mod.rs
@@ -125,7 +125,10 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> {
                 .get(&byte_span.end)
                 .expect("byte index is at char boundary");
 
-            groups.push(Group::new(Span::new(byte_span, char_start..char_end)));
+            groups.push(Group::new(
+                Span::new(byte_span, char_start..char_end)
+                    .rshift(sentence.span().start()),
+            ));
         } else {
             groups.push(Group::new(Span::default()));
         }
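On the construction side, group spans coming out of the engine are now rshifted by `sentence.span().start()`, so they are document-absolute no matter where the sentence begins; producers emit absolute spans and consumers such as `slice` lshift them back. A small sketch of that invariant over plain byte ranges (hypothetical helpers, not the crate's `Span` API):

```rust
use std::ops::Range;

// rshift on construction and lshift on consumption are inverses, so a span
// means the same thing whether the sentence starts at byte 0 or byte 417.
fn rshift(range: Range<usize>, delta: usize) -> Range<usize> {
    range.start + delta..range.end + delta
}

fn lshift(range: Range<usize>, delta: usize) -> Range<usize> {
    range.start - delta..range.end - delta
}

fn main() {
    let sentence_start = 417;
    let relative = 3..8; // a matched group's byte range within the sentence
    let absolute = rshift(relative.clone(), sentence_start);
    assert_eq!(absolute, 420..425);
    assert_eq!(lshift(absolute, sentence_start), relative);
}
```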
4 changes: 2 additions & 2 deletions nlprule/src/rule/grammar.rs
@@ -93,7 +93,7 @@ pub struct Match {
 
 impl Match {
     fn apply(&self, sentence: &MatchSentence, graph: &MatchGraph) -> Option<String> {
-        let text = graph.by_id(self.id).text(sentence.text());
+        let text = graph.by_id(self.id).text(sentence);
 
         let mut text = if let Some(replacer) = &self.pos_replacer {
             replacer.apply(text, sentence)?
@@ -170,7 +170,7 @@ impl Synthesizer {
             .chars()
             .next() // a word is expected to always have at least one char, but be defensive here
             .map_or(false, char::is_uppercase))
-            || first_token.span().byte().start == 0
+            || first_token.span().start() == sentence.span().start()
         })
         .unwrap_or(false);

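The Synthesizer's start-of-sentence check previously compared the first token's byte offset against 0, which only holds for the first sentence of a document; it now compares against the sentence's own start position. A tiny before/after sketch with hypothetical helper functions:

```rust
// Hypothetical helpers illustrating the check; in the PR these correspond to
// first_token.span().byte().start == 0 (before) versus
// first_token.span().start() == sentence.span().start() (after).
fn starts_sentence_old(first_token_byte_start: usize) -> bool {
    // only true for the very first sentence of a document
    first_token_byte_start == 0
}

fn starts_sentence_new(first_token_start: usize, sentence_start: usize) -> bool {
    // true for every sentence, wherever it starts in the document
    first_token_start == sentence_start
}

fn main() {
    // a second sentence starting at absolute byte 42
    assert!(!starts_sentence_old(42));
    assert!(starts_sentence_new(42, 42));
}
```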
80 changes: 56 additions & 24 deletions nlprule/src/rule/mod.rs
@@ -9,8 +9,8 @@ use crate::{
 use itertools::Itertools;
 use log::{error, info, warn};
 use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
 use std::fmt;
-use std::{collections::HashSet, ops::Range};
 
 pub(crate) mod disambiguation;
 pub(crate) mod engine;
@@ -94,7 +94,30 @@ pub struct DisambiguationRule {
 }
 
 #[derive(Default)]
-pub(crate) struct Changes(Vec<Vec<HashSet<Range<usize>>>>);
+pub(crate) struct Changes(Vec<Vec<HashSet<Span>>>);
+
+// This is only used in tests at the moment.
+// Could maybe be made generic.
+impl Changes {
+    fn lshift(self, position: Position) -> Self {
+        Changes(
+            self.0
+                .into_iter()
+                .map(|spans| {
+                    spans
+                        .into_iter()
+                        .map(|group_spans| {
+                            group_spans
+                                .into_iter()
+                                .map(|span| span.lshift(position))
+                                .collect()
+                        })
+                        .collect()
+                })
+                .collect(),
+        )
+    }
+}
 
 impl Changes {
     pub fn is_empty(&self) -> bool {
@@ -113,7 +136,7 @@ impl DisambiguationRule {
             return Changes::default();
         }
 
-        let mut all_byte_spans = Vec::new();
+        let mut all_spans = Vec::new();
 
         for graph in self.engine.get_matches(sentence, self.start, self.end) {
             if let Some(unification) = &self.unification {
@@ -128,39 +151,34 @@ impl DisambiguationRule {
                 }
             }
 
-            let mut byte_spans = Vec::new();
+            let mut spans = Vec::new();
 
             for group_idx in GraphId::range(&self.start, &self.end) {
                 let group = graph.by_id(group_idx);
 
-                let group_byte_spans: HashSet<_> = group
-                    .tokens(sentence)
-                    .map(|x| x.span().byte().clone())
-                    .collect();
+                let group_spans: HashSet<_> =
+                    group.tokens(sentence).map(|x| x.span().clone()).collect();
 
-                byte_spans.push(group_byte_spans);
+                spans.push(group_spans);
             }
 
-            all_byte_spans.push(byte_spans);
+            all_spans.push(spans);
         }
 
-        Changes(all_byte_spans)
+        Changes(all_spans)
     }
 
     pub(crate) fn change<'t>(&'t self, sentence: &mut IncompleteSentence<'t>, changes: Changes) {
         log::info!("applying {}", self.id);
 
-        for byte_spans in changes.0 {
+        for spans in changes.0 {
             let mut groups = Vec::new();
             let mut refs = sentence.iter_mut().collect::<Vec<_>>();
 
-            for group_byte_spans in byte_spans {
+            for group_spans in spans {
                 let mut group = Vec::new();
 
-                while let Some(i) = refs
-                    .iter()
-                    .position(|x| group_byte_spans.contains(&x.span().byte()))
-                {
+                while let Some(i) = refs.iter().position(|x| group_spans.contains(&x.span())) {
                     group.push(refs.remove(i));
                 }

@@ -189,8 +207,15 @@ impl DisambiguationRule {
                 .expect("test text must not be empty"),
             Some(&self.id),
         );
-        let sentence_before_complete = sentence_before.clone().into_sentence();
-        let changes = self.apply(&MatchSentence::new(&sentence_before_complete));
+
+        // shift the sentence to the right before matching to make sure
+        // nothing assumes the sentence starts from absolute index zero
+        let shift_delta = Position { byte: 1, char: 1 };
+        let sentence_before_complete =
+            sentence_before.clone().rshift(shift_delta).into_sentence();
+        let changes = self
+            .apply(&MatchSentence::new(&sentence_before_complete))
+            .lshift(shift_delta);
 
         let mut sentence_after = sentence_before.clone();

@@ -319,10 +344,7 @@ impl<'a, 't> Iterator for Suggestions<'a, 't> {
                 return None;
             }
 
-            let text_before = &sentence.text()[Span::from_positions(start, end)
-                .lshift(sentence.span().start())
-                .byte()
-                .clone()];
+            let text_before = sentence.slice(Span::from_positions(start, end));
 
             // fix e. g. "Super , dass"
             let replacements: Vec<String> = replacements
@@ -450,6 +472,11 @@ impl Rule {
     pub fn test(&self, tokenizer: &Tokenizer) -> bool {
         let mut passes = Vec::new();
 
+        // make sure relative position is handled correctly
+        // shifting the entire sentence must be a no-op as far as the matcher is concerned
+        // if the suggestions are shifted back
+        let shift_delta = Position { byte: 1, char: 1 };
+
         for test in self.examples.iter() {
             // by convention examples are always considered as one sentence even if the sentencizer would split
             let sentence = tokenizer
@@ -458,9 +485,14 @@
                     .tokenize(&test.text())
                     .expect("test text must not be empty."),
             )
+            .rshift(shift_delta)
             .into_sentence();
 
             info!("Sentence: {:#?}", sentence);
-            let suggestions: Vec<_> = self.apply(&MatchSentence::new(&sentence)).collect();
+            let suggestions: Vec<_> = self
+                .apply(&MatchSentence::new(&sentence))
+                .map(|s| s.lshift(shift_delta))
+                .collect();
 
             let pass = if suggestions.len() > 1 {
                 false
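Both `DisambiguationRule::test` and `Rule::test` now exercise the fix directly: the test sentence is rshifted by `Position { byte: 1, char: 1 }` before matching and the resulting changes or suggestions are lshifted back, so any code path that still assumes the sentence starts at index 0 fails the round-trip. A minimal sketch of that shift-and-unshift pattern with a stand-in matcher (not the real engine):

```rust
use std::ops::Range;

// A stand-in "matcher" that reports the absolute byte range of a word; a
// correct matcher must honor `offset` instead of assuming the text starts
// at byte 0 of the document.
fn find_word(text: &str, offset: usize, word: &str) -> Option<Range<usize>> {
    text.find(word).map(|i| offset + i..offset + i + word.len())
}

fn lshift(range: Range<usize>, delta: usize) -> Range<usize> {
    range.start - delta..range.end - delta
}

fn main() {
    let text = "nobody expects the Spanish Inquisition";

    // match with the sentence at offset 0 ...
    let unshifted = find_word(text, 0, "Spanish").unwrap();

    // ... and again pretending the sentence starts one byte later,
    // then shift the result back, as the tests now do with suggestions
    let shift_delta = 1;
    let shifted_back = lshift(find_word(text, shift_delta, "Spanish").unwrap(), shift_delta);

    // shifting the whole sentence must be a no-op once results are shifted back
    assert_eq!(unshifted, shifted_back);
}
```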
6 changes: 6 additions & 0 deletions nlprule/src/types.rs
@@ -721,4 +721,10 @@ impl Suggestion {
         self.span = self.span.rshift(position);
         self
     }
+
+    /// Shift the span left by the specified amount.
+    pub fn lshift(mut self, position: Position) -> Self {
+        self.span = self.span.lshift(position);
+        self
+    }
 }
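`Suggestion::lshift` mirrors the existing `rshift`, letting callers map a suggestion produced against a shifted sentence back into their own coordinates. A simplified sketch of the consuming-`self` pattern over a plain byte range (not the crate's real `Suggestion`, which also carries replacements and both byte and char offsets):

```rust
use std::ops::Range;

// A simplified suggestion carrying only a byte range.
#[derive(Debug, PartialEq)]
struct Suggestion {
    span: Range<usize>,
}

impl Suggestion {
    /// Shift the span right by the specified amount.
    fn rshift(mut self, delta: usize) -> Self {
        self.span = self.span.start + delta..self.span.end + delta;
        self
    }

    /// Shift the span left by the specified amount.
    fn lshift(mut self, delta: usize) -> Self {
        self.span = self.span.start - delta..self.span.end - delta;
        self
    }
}

fn main() {
    let suggestion = Suggestion { span: 10..15 };
    // shifting right and then left by the same delta is a round trip
    assert_eq!(suggestion.rshift(3).lshift(3), Suggestion { span: 10..15 });
}
```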