Skip to content

Commit

Permalink
Turns out we introduced a regression because bad code. (#1060)
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil authored Sep 16, 2022
1 parent 7bfab48 commit 7c146d9
Showing 1 changed file with 30 additions and 4 deletions.
34 changes: 30 additions & 4 deletions tokenizers/src/decoders/wordpiece.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ impl Decoder for WordPiece {
.iter_mut()
.enumerate()
.map(|(i, token)| {
if token.starts_with(&self.prefix) {
*token = token.replacen(&self.prefix, "", 1);
} else if i != 0 {
*token = format!(" {}", token);
if i != 0 {
if token.starts_with(&self.prefix) {
*token = token.replacen(&self.prefix, "", 1);
} else {
*token = format!(" {}", token);
}
}
if self.cleanup {
*token = cleanup(token);
Expand All @@ -62,3 +64,27 @@ impl Decoder for WordPiece {
.collect::<Result<_>>()
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for prefix handling at the start of the sequence.
    ///
    /// The first token keeps its `##` prefix verbatim; a prefixed token at any
    /// later position is glued onto the preceding word, and an unprefixed one
    /// starts a new space-separated word.
    #[test]
    fn wordpiece_decoder() {
        // Second argument is presumably the `cleanup` flag (disabled here) — confirm
        // against `WordPiece::new`.
        let decoder = WordPiece::new("##".to_string(), false);

        let tokens: Vec<String> = ["##uelo", "Ara", "##új", "##o", "No", "##guera"]
            .iter()
            .map(|piece| piece.to_string())
            .collect();

        let decoded = decoder.decode(tokens).unwrap();

        assert_eq!(decoded, "##uelo Araújo Noguera");
    }
}

0 comments on commit 7c146d9

Please sign in to comment.