Commit

Adds a wordpiece to whole-token converter. This is useful for looking at predictions.

This change also introduces some tests for the code-to-subtoken library.

PiperOrigin-RevId: 373478060
maniatis authored and copybara-github committed May 12, 2021
1 parent affafc6 commit db60322
Showing 3 changed files with 230 additions and 7 deletions.
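
For orientation, a minimal usage sketch of the converter this commit adds, assembled from the functions and test fixtures in the diff below. It is illustrative only; `vocab_path` and the example code snippet are placeholders, not part of the commit.

# Illustrative sketch (not part of this commit). vocab_path is a placeholder
# for a WordPiece vocabulary file compatible with the CuBERT tokenizer.
from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer

tokenizer = python_tokenizer.PythonTokenizer()
subword_encoder = text_encoder.SubwordTextEncoder(vocab_path)

# Existing direction: source code -> WordPiece subtoken sentences.
sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
    'def foo_bar(): return("ab, cd")', tokenizer, subword_encoder)

# New reverse direction: WordPiece subtokens -> source code, useful when
# looking at model predictions.
code = code_to_subtokenized_sentences.wordpiece_subtokens_to_code(
    sentences[0], tokenizer, subword_encoder)

# New greedy scanner: the first whole token and the number of WordPiece
# subtokens it consumed.
whole_token, end_index = code_to_subtokenized_sentences.next_whole_token(
    sentences[0], tokenizer, subword_encoder)
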
133 changes: 128 additions & 5 deletions cubert/code_to_subtokenized_sentences.py
@@ -15,7 +15,7 @@

"""This module demonstrates how to convert code to subtokenized sentences."""
import itertools
-from typing import List, Text
+from typing import List, Sequence, Tuple


from absl import logging
@@ -26,6 +26,129 @@
from cubert import unified_tokenizer


def wordpiece_ids_from_wordpiece_tokens(
wordpiece_subtokens,
subword_tokenizer):
return tuple(
subword_tokenizer._subtoken_string_to_id[w] # pylint: disable=protected-access
for w in wordpiece_subtokens)


def next_whole_token(
wordpiece_subtokens,
initial_tokenizer,
subword_tokenizer):
"""Greedily reconstitutes a whole token from a WordPiece list.

This function assumes that the WordPiece subtokens were constructed correctly
from correctly CuBERT-subtokenized code, but the sequence may be truncated and
thus incomplete.

The implementation works in two stages: recognizing the first whole token, and
then finding which prefix of the subtoken sequence corresponds to it.

The implementation assumes that untokenization does its best job when given
the full context. So it first untokenizes the whole sequence and takes the
first whole token of the result.

To figure out the subtoken prefix that corresponds to that whole token, the
implementation greedily untokenizes longer and longer subtoken prefixes until
the whole token is recognized in the output.

The reason for this somewhat expensive implementation is that the logic for
merging subtokens (for WordPiece and then for CuBERT) is intricate, and does
not export how many initial subtokens were consumed for each output token of
the next higher abstraction. What's more, when the subtoken sequence is
incomplete, a subtoken may align itself with either the previous or the next
whole token.

Args:
  wordpiece_subtokens: The subtokens to scan through.
  initial_tokenizer: A CuBERT tokenizer.
  subword_tokenizer: A SubwordTextEncoder.

Returns:
  The first whole token matched, and the end index of the first subtoken after
  that whole token; wordpiece_subtokens[0:end_index] are the subtokens
  corresponding to the returned whole token.

Raises:
  ValueError: If no whole token can be parsed.
"""

wordpiece_ids = wordpiece_ids_from_wordpiece_tokens(wordpiece_subtokens,
subword_tokenizer)
full_cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
wordpiece_ids))

full_cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

full_whole_tokens = initial_tokenizer.untokenize_agnostic(
full_cubert_subtokens)

if len(full_whole_tokens) < 2:
# It all came out a jumble. Reject it.
raise ValueError(f'Whole tokens {full_whole_tokens} ended up '
f'undifferentiable in {wordpiece_subtokens}.')

whole_token = full_whole_tokens[0]

for end_index in range(1, len(wordpiece_ids) + 1):
prefix_list = wordpiece_ids[:end_index]
partial_cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
prefix_list))

# We strip EOS in `code_to_cubert_sentences`, so we have to add it back
# here.
partial_cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

partial_whole_tokens = initial_tokenizer.untokenize_agnostic(
partial_cubert_subtokens)
if len(partial_whole_tokens) > 1:
if partial_whole_tokens[0] == whole_token:
return whole_token, end_index

# We got here because we couldn't match the whole token we found from the
# full sequence.
raise ValueError('Could not find a whole token in %r' %
(wordpiece_subtokens,))


def wordpiece_subtokens_to_code(
wordpiece_subtokens,
initial_tokenizer,
subword_tokenizer):
"""Converts WordPiece subtoken strings back to source code."""
# We have to map WordPiece subtoken strings back to WordPiece vocabulary IDs.
wordpiece_ids = wordpiece_ids_from_wordpiece_tokens(wordpiece_subtokens,
subword_tokenizer)

return wordpiece_ids_to_code(wordpiece_ids, initial_tokenizer,
subword_tokenizer)


def wordpiece_ids_to_code(
wordpiece_ids,
initial_tokenizer,
subword_tokenizer):
"""Converts WordPiece vocabulary IDs back to source code."""
cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
wordpiece_ids))

# We strip EOS in `code_to_cubert_sentences`, so we have to add it back here.
cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

code = initial_tokenizer.untokenize(cubert_subtokens)
return code


def code_to_cubert_sentences(
code,
initial_tokenizer,
@@ -46,7 +169,7 @@ def code_to_cubert_sentences(
Returns:
A list of sentences.
"""
-  tokens = initial_tokenizer.tokenize(code)[:-1]  # type: List[Text]  # pytype: disable=annotation-type-mismatch
+  tokens: Sequence[str] = initial_tokenizer.tokenize(code)[:-1]
logging.vlog(5, 'Code >>>%s<<< is tokenized into >>>%s<<<.', code, tokens)

# This will split the list into sublists of non-NEWLINE tokens (key is
@@ -100,21 +223,21 @@ def code_to_cubert_sentences(
# drops any trailing \n's before tokenizing, but for the purpose of forming
# properly terminated sentences, we always end sentences in a NEWLINE token.
sentences = [s + [unified_tokenizer.NEWLINE] for s in raw_sentences
-              ]  # type: List[List[Text]]
+              ]  # type: List[List[str]]
logging.vlog(5, 'Tokens are split into sentences: >>>%s<<<.',
sentences)

# Now we have to encode tokens using the subword text encoder, expanding the
# sentences.
-  subtokenized_sentences = []  # type: List[List[Text]]
+  subtokenized_sentences = []  # type: List[List[str]]
for sentence in sentences:
encoded_tokens = [subword_tokenizer.encode_without_tokenizing(t)
for t in sentence] # type: List[List[int]]
logging.vlog(5, 'Sentence encoded into >>>%s<<<.', encoded_tokens)
flattened_encodings = sum(encoded_tokens, []) # type: List[int]
logging.vlog(5, 'Flattened into >>>%s<<<.', flattened_encodings)
decoded_tokens = subword_tokenizer.decode_list(
-        flattened_encodings)  # type: List[Text]
+        flattened_encodings)  # type: List[str]
logging.vlog(5, 'Sentence re-decoded into >>>%s<<<.', decoded_tokens)

subtokenized_sentences.append(decoded_tokens)
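
As a follow-up to the contract documented in `next_whole_token` above, here is a hedged sketch (not part of this commit) of how the function might be applied repeatedly to recover as many whole tokens as possible from a possibly truncated WordPiece sequence. The helper name is hypothetical, and the sketch assumes `from cubert import code_to_subtokenized_sentences`.

# Hypothetical helper built on next_whole_token; not part of this commit.
def whole_tokens_from_wordpieces(wordpiece_subtokens, tokenizer, subword_encoder):
  """Greedily recovers whole tokens until the remainder cannot be parsed."""
  whole_tokens = []
  remaining = list(wordpiece_subtokens)
  while remaining:
    try:
      token, end_index = code_to_subtokenized_sentences.next_whole_token(
          remaining, tokenizer, subword_encoder)
    except ValueError:
      break  # The leftover subtokens do not form a complete whole token.
    whole_tokens.append(token)
    remaining = remaining[end_index:]
  return whole_tokens
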
96 changes: 96 additions & 0 deletions cubert/code_to_subtokenized_sentences_test.py
@@ -0,0 +1,96 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for code_to_subtokenized_sentences."""
from typing import Sequence


from absl.testing import absltest
from absl.testing import parameterized
from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer


class CodeToSubtokenizedSentencesTest(parameterized.TestCase):

_CODE = 'def foo_bar(): return("ab, cd")'
_RECONSTITUTED_CODE = 'def foo_bar ():return ("ab, cd")\n'
_WORDPIECE_VOCABULARY = ('def_', 'foo\\u^_', 'bar_', '(_', ')_', ':_', 'ret',
'urn_', '"^_', 'ab^_', ',^_', ' ^_', 'cd^_', '"_',
'\\u\\u\\uNEWLINE\\u\\u\\u_', 'uv', 'xy', 'yz_',
'ww^_', 'www_')
_QUOTED_VOCABULARY = (f'"{w}"' for w in _WORDPIECE_VOCABULARY)
_VOCABULARY_CONTENT = '\n'.join(_QUOTED_VOCABULARY)
_WORDPIECE_SUBTOKENS = [
'def_', 'foo\\u^_', 'bar_', '(_', ')_', ':_', 'ret', 'urn_', '(_', '"^_',
'ab^_', ',^_', ' ^_', 'cd^_', '"_', ')_', '\\u\\u\\uNEWLINE\\u\\u\\u_'
]

def setUp(self):
super().setUp()
self.tokenizer = python_tokenizer.PythonTokenizer()
subword_vocabulary_path = self.create_tempfile(
content=self._VOCABULARY_CONTENT).full_path
self.subword_text_encoder = text_encoder.SubwordTextEncoder(
subword_vocabulary_path)

def test_code_to_sentences(self):
sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
self._CODE, self.tokenizer, self.subword_text_encoder)
self.assertEqual([self._WORDPIECE_SUBTOKENS], sentences)

def test_wordpiece_tokens_to_code(self):
code = code_to_subtokenized_sentences.wordpiece_subtokens_to_code(
self._WORDPIECE_SUBTOKENS, self.tokenizer, self.subword_text_encoder)
self.assertEqual(self._RECONSTITUTED_CODE, code)

@parameterized.named_parameters(
('complete_wordpiece_only', ('uv', 'yz_'), 'uvyz', 2),
('complete_wordpiece_prefix', ('uv', 'yz_', 'yz_'), 'uvyz', 2),
('complete_wordpiece_before_incomplete', ('uv', 'yz_', 'xy'), 'uvyz', 2),
('complete_cubert_only', ('uv', 'ww^_', 'yz_'), 'uvwwyz', 3),
('complete_cubert_prefix', ('uv', 'ww^_', 'yz_', 'yz_'), 'uvwwyz', 3),
('complete_cubert_before_incomplete',
('uv', 'ww^_', 'yz_', 'xy'), 'uvwwyz', 3),
('complete_cubert_before_incomplete_cubert',
('uv', 'ww^_', 'yz_', 'ww^_'), 'uvwwyz', 3),
# Here SubwordTextEncoder accepts a partial WordPiece subtoken, and we
# interpret it as best we can.
('complete_cubert_subtoken_before_incomplete_wordpiece',
('uv', 'ww^_', 'xy'), 'uvwwxy', 3),
# The SubwordTextEncoder accepts a partial WordPiece subtoken.
('incomplete_wordpiece_only', ('uv',), 'uv', 1),
)
def test_next_whole_token(self, subtokens,
expected_whole_token,
expected_end_index):
actual_whole_result = code_to_subtokenized_sentences.next_whole_token(
subtokens, self.tokenizer, self.subword_text_encoder)
self.assertEqual((expected_whole_token, expected_end_index),
actual_whole_result)

@parameterized.named_parameters(
('incomplete_cubert_only', ('uv', 'ww^_')),
)
def test_next_whole_token_fails(self, subtokens):
with self.assertRaises(ValueError):
_ = code_to_subtokenized_sentences.next_whole_token(
subtokens, self.tokenizer, self.subword_text_encoder)


if __name__ == '__main__':
absltest.main()
8 changes: 6 additions & 2 deletions cubert/cubert_tokenizer.py
@@ -176,8 +176,8 @@ def tokenize(self, source_code):
subtokens = unified_tokenizer.flatten_subtoken_lists(multi_tokens)
return subtokens

-  def untokenize(self, token_list):
-    """Untokenizes via `untokenize_abstract`."""
+  def untokenize_agnostic(self, token_list):
+    """Turns CuBERT subtokens into whole tokens."""
# Untokenize agnostic.
if (not token_list or token_list[-1] != unified_tokenizer.quote_special(
unified_tokenizer.TokenKind.EOS.name)):
@@ -190,7 +190,11 @@ def untokenize(self, token_list):
token_list,
sanitization_mapping=self.mappings,
sentinel=unified_tokenizer.SENTINEL)
return whole_tokens

def untokenize(self, token_list):
"""Untokenizes via `untokenize_abstract`."""
whole_tokens = self.untokenize_agnostic(token_list)
return self.untokenize_abstract(whole_tokens)
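
For clarity, a hedged sketch of the two-stage relationship established by this refactor, assuming `tokenizer` is a concrete CuBERT tokenizer instance and `subtokens` is a CuBERT subtoken list ending in the EOS token:

# Stage 1 (new public method): CuBERT subtokens -> language-agnostic whole tokens.
whole_tokens = tokenizer.untokenize_agnostic(subtokens)
# Stage 2: whole tokens -> source text via the language-specific untokenize_abstract,
# so tokenizer.untokenize(subtokens) is equivalent to:
source = tokenizer.untokenize_abstract(whole_tokens)
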


