Skip to content

Commit

Permalink
Adding overly complex overflowing test.
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil committed Aug 31, 2022
1 parent 0c2f646 commit 1a7d0ae
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 16 deletions.
131 changes: 117 additions & 14 deletions tokenizers/src/processors/template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -892,22 +892,26 @@ mod tests {
assert_eq!(processor.added_tokens(true), 3);

use crate::Token;
let encoding = Encoding::from_tokens(
let mut encoding = Encoding::from_tokens(
vec![
Token::new(12, "Hello".into(), (0, 5)),
Token::new(14, "there".into(), (6, 11)),
Token::new(13, "you".into(), (12, 15)),
],
0,
);
let pair = Encoding::from_tokens(
let overflowing = Encoding::from_tokens(vec![Token::new(13, "you".into(), (12, 15))], 0);
encoding.set_overflowing(vec![overflowing]);

let mut pair = Encoding::from_tokens(
vec![
Token::new(15, "pair".into(), (0, 4)),
Token::new(16, "with".into(), (5, 9)),
Token::new(17, "info".into(), (10, 14)),
],
0,
);
let pair_overflowing =
Encoding::from_tokens(vec![Token::new(17, "info".into(), (10, 14))], 0);
pair.set_overflowing(vec![pair_overflowing]);

let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
assert_eq!(
Expand All @@ -925,38 +929,137 @@ mod tests {
vec![(0, 0), (0, 5), (6, 11), (0, 0)],
vec![1, 0, 0, 1],
vec![1, 1, 1, 1],
vec![],
vec![Encoding::new(
vec![1, 13, 0],
vec![0, 0, 0],
vec!["[CLS]".into(), "you".into(), "[SEP]".into()],
vec![None, None, None],
vec![(0, 0), (12, 15), (0, 0)],
vec![1, 0, 1],
vec![1, 1, 1],
vec![],
HashMap::from_iter(vec![(0, 1..2)]),
)],
HashMap::from_iter(vec![(0, 1..3)]),
)
);
assert_eq!(single_encoding.token_to_sequence(2), Some(0));
assert_eq!(single_encoding.token_to_sequence(3), None);
let pair_encoding = processor.process(encoding, Some(pair), true).unwrap();
println!("{pair_encoding:#?}");
assert_eq!(
pair_encoding,
Encoding::new(
vec![1, 12, 14, 0, 15, 0],
vec![0, 0, 0, 0, 1, 1],
vec![1, 12, 14, 0, 15, 16, 0],
vec![0, 0, 0, 0, 1, 1, 1],
vec![
"[CLS]".into(),
"Hello".into(),
"there".into(),
"[SEP]".into(),
"pair".into(),
"with".into(),
"[SEP]".into()
],
vec![None, None, None, None, None, None],
vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)],
vec![1, 0, 0, 1, 0, 1],
vec![1, 1, 1, 1, 1, 1],
vec![],
HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
vec![None, None, None, None, None, None, None],
vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (5, 9), (0, 0)],
vec![1, 0, 0, 1, 0, 0, 1],
vec![1, 1, 1, 1, 1, 1, 1],
vec![
Encoding::new(
vec![1, 13, 0, 15, 16, 0],
vec![0, 0, 0, 1, 1, 1],
vec![
"[CLS]".into(),
"you".into(),
"[SEP]".into(),
"pair".into(),
"with".into(),
"[SEP]".into()
],
vec![None, None, None, None, None, None],
vec![(0, 0), (12, 15), (0, 0), (0, 4), (5, 9), (0, 0)],
vec![1, 0, 1, 0, 0, 1],
vec![1, 1, 1, 1, 1, 1],
vec![Encoding::new(
vec![1, 13, 0, 17, 0],
vec![0, 0, 0, 0, 1],
vec![
"[CLS]".into(),
"you".into(),
"[SEP]".into(),
"info".into(),
"[SEP]".into()
],
vec![None, None, None, None, None,],
vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
vec![1, 0, 1, 0, 1],
vec![1, 1, 1, 1, 1],
vec![],
HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
),],
HashMap::from_iter(vec![(1, 3..5), (0, 1..2)]),
),
Encoding::new(
vec![1, 13, 0, 17, 0],
vec![0, 0, 0, 0, 1],
vec![
"[CLS]".into(),
"you".into(),
"[SEP]".into(),
"info".into(),
"[SEP]".into()
],
vec![None, None, None, None, None,],
vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
vec![1, 0, 1, 0, 1],
vec![1, 1, 1, 1, 1],
vec![],
HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
),
Encoding::new(
vec![1, 12, 14, 0, 17, 0],
vec![0, 0, 0, 0, 0, 1],
vec![
"[CLS]".into(),
"Hello".into(),
"there".into(),
"[SEP]".into(),
"info".into(),
"[SEP]".into()
],
vec![None, None, None, None, None, None],
vec![(0, 0), (0, 5), (6, 11), (0, 0), (10, 14), (0, 0)],
vec![1, 0, 0, 1, 0, 1],
vec![1, 1, 1, 1, 1, 1],
vec![Encoding::new(
vec![1, 13, 0, 17, 0],
vec![0, 0, 0, 0, 1],
vec![
"[CLS]".into(),
"you".into(),
"[SEP]".into(),
"info".into(),
"[SEP]".into()
],
vec![None, None, None, None, None,],
vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
vec![1, 0, 1, 0, 1],
vec![1, 1, 1, 1, 1],
vec![],
HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
),],
HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
)
],
HashMap::from_iter(vec![(0, 1..3), (1, 4..6)]),
)
);
assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
assert_eq!(pair_encoding.token_to_sequence(3), None);
assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
assert_eq!(pair_encoding.token_to_sequence(5), None);
assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
assert_eq!(pair_encoding.token_to_sequence(6), None);
}
#[test]
fn pair_must_use_both_sequences() {
Expand Down
6 changes: 4 additions & 2 deletions tokenizers/src/tokenizer/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ impl Encoding {
&self.overflowing
}

pub fn set_overflowing(&mut self, overflowing: Vec<Encoding>) {
self.overflowing = overflowing;
}

pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding> {
&mut self.overflowing
}
Expand Down Expand Up @@ -405,8 +409,6 @@ impl Encoding {
// Handle merging the overflowing parts too: Combine them all
// In most of the cases, we expect `pair.overflowing.len() == 0`
let mut overflowings = vec![];
println!("Overflowing self {:?}", self.overflowing);
println!("Overflowing pair {:?}", pair.overflowing);

// 1. All our overflowings with all the others
for self_o in &self.overflowing {
Expand Down

0 comments on commit 1a7d0ae

Please sign in to comment.