Commit

Adds a wordpiece to whole-token converter. This is useful for looking at predictions.

This change also introduces some tests for the code-to-subtoken library.

PiperOrigin-RevId: 373478060
maniatis authored and copybara-github committed May 12, 2021
1 parent affafc6 commit db60322
Showing 3 changed files with 230 additions and 7 deletions.
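
For orientation, a minimal usage sketch of the converter this commit adds, assembled from the functions and test fixtures in the diff below. It is illustrative only; `vocab_path` and the example code snippet are placeholders, not part of the commit.

# Illustrative sketch (not part of this commit). vocab_path is a placeholder
# for a WordPiece vocabulary file compatible with the CuBERT tokenizer.
from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer

tokenizer = python_tokenizer.PythonTokenizer()
subword_encoder = text_encoder.SubwordTextEncoder(vocab_path)

# Existing direction: source code -> WordPiece subtoken sentences.
sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
    'def foo_bar(): return("ab, cd")', tokenizer, subword_encoder)

# New reverse direction: WordPiece subtokens -> source code, useful when
# looking at model predictions.
code = code_to_subtokenized_sentences.wordpiece_subtokens_to_code(
    sentences[0], tokenizer, subword_encoder)

# New greedy scanner: the first whole token and the number of WordPiece
# subtokens it consumed.
whole_token, end_index = code_to_subtokenized_sentences.next_whole_token(
    sentences[0], tokenizer, subword_encoder)
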
133 changes: 128 additions & 5 deletions cubert/code_to_subtokenized_sentences.py
@@ -15,7 +15,7 @@

"""This module demonstrates how to convert code to subtokenized sentences."""
import itertools
-from typing import List, Text
+from typing import List, Sequence, Tuple


from absl import logging
@@ -26,6 +26,129 @@
from cubert import unified_tokenizer


def wordpiece_ids_from_wordpiece_tokens(
wordpiece_subtokens,
subword_tokenizer):
return tuple(
subword_tokenizer._subtoken_string_to_id[w] # pylint: disable=protected-access
for w in wordpiece_subtokens)


def next_whole_token(
wordpiece_subtokens,
initial_tokenizer,
subword_tokenizer):
"""Greedily reconstitutes a whole token from a WordPiece list.

This function assumes that the WordPiece subtokens were constructed correctly
from correctly CuBERT-subtokenized code, but the sequence may be truncated and
thus incomplete.

The implementation works in two stages: recognizing the first whole token, and
then finding which prefix of the subtoken sequence corresponds to it.

The implementation assumes that untokenization does its best job when given
the full context. So it first untokenizes the whole sequence and takes the
first whole token of the result.

To figure out the subtoken prefix that corresponds to that whole token, the
implementation greedily untokenizes longer and longer subtoken prefixes until
the whole token is recognized in the output.

The reason for this somewhat expensive implementation is that the logic for
merging subtokens (for WordPiece and then for CuBERT) is intricate, and does
not export how many initial subtokens were consumed for each output token of
the next higher abstraction. What's more, when the subtoken sequence is
incomplete, a subtoken may align itself with either the previous or the next
whole token.

Args:
  wordpiece_subtokens: The subtokens to scan through.
  initial_tokenizer: A CuBERT tokenizer.
  subword_tokenizer: A SubwordTextEncoder.

Returns:
  The first whole token matched, and the end index of the first subtoken after
  that whole token; wordpiece_subtokens[0:end_index] are the subtokens
  corresponding to the returned whole token.

Raises:
  ValueError: If no whole token can be parsed.
"""

wordpiece_ids = wordpiece_ids_from_wordpiece_tokens(wordpiece_subtokens,
subword_tokenizer)
full_cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
wordpiece_ids))

full_cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

full_whole_tokens = initial_tokenizer.untokenize_agnostic(
full_cubert_subtokens)

if len(full_whole_tokens) < 2:
# It all came out a jumble. Reject it.
raise ValueError(f'Whole tokens {full_whole_tokens} ended up '
f'undifferentiable in {wordpiece_subtokens}.')

whole_token = full_whole_tokens[0]

for end_index in range(1, len(wordpiece_ids) + 1):
prefix_list = wordpiece_ids[:end_index]
partial_cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
prefix_list))

# We strip EOS in `code_to_cubert_sentences`, so we have to add it back
# here.
partial_cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

partial_whole_tokens = initial_tokenizer.untokenize_agnostic(
partial_cubert_subtokens)
if len(partial_whole_tokens) > 1:
if partial_whole_tokens[0] == whole_token:
return whole_token, end_index

# We got here because we couldn't match the whole token we found from the
# full sequence.
raise ValueError('Could not find a whole token in %r' %
(wordpiece_subtokens,))


def wordpiece_subtokens_to_code(
wordpiece_subtokens,
initial_tokenizer,
subword_tokenizer):
"""Converts WordPiece subtoken strings back to source code."""
# We have to map WordPiece subtoken strings back to WordPiece vocabulary IDs.
wordpiece_ids = wordpiece_ids_from_wordpiece_tokens(wordpiece_subtokens,
subword_tokenizer)

return wordpiece_ids_to_code(wordpiece_ids, initial_tokenizer,
subword_tokenizer)


def wordpiece_ids_to_code(
wordpiece_ids,
initial_tokenizer,
subword_tokenizer):
"""Converts WordPiece vocabulary IDs back to source code."""
cubert_subtokens: List[str] = (
subword_tokenizer._subtoken_ids_to_tokens( # pylint: disable=protected-access
wordpiece_ids))

# We strip EOS in `code_to_cubert_sentences`, so we have to add it back here.
cubert_subtokens.append(
unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

code = initial_tokenizer.untokenize(cubert_subtokens)
return code


def code_to_cubert_sentences(
code,
initial_tokenizer,
@@ -46,7 +169,7 @@ def code_to_cubert_sentences(
Returns:
A list of sentences.
"""
-  tokens = initial_tokenizer.tokenize(code)[:-1]  # type: List[Text]  # pytype: disable=annotation-type-mismatch
+  tokens: Sequence[str] = initial_tokenizer.tokenize(code)[:-1]
logging.vlog(5, 'Code >>>%s<<< is tokenized into >>>%s<<<.', code, tokens)

# This will split the list into sublists of non-NEWLINE tokens (key is
@@ -100,21 +223,21 @@ def code_to_cubert_sentences(
# drops any trailing \n's before tokenizing, but for the purpose of forming
# properly terminated sentences, we always end sentences in a NEWLINE token.
sentences = [s + [unified_tokenizer.NEWLINE] for s in raw_sentences
-              ]  # type: List[List[Text]]
+              ]  # type: List[List[str]]
logging.vlog(5, 'Tokens are split into sentences: >>>%s<<<.',
sentences)

# Now we have to encode tokens using the subword text encoder, expanding the
# sentences.
-  subtokenized_sentences = []  # type: List[List[Text]]
+  subtokenized_sentences = []  # type: List[List[str]]
for sentence in sentences:
encoded_tokens = [subword_tokenizer.encode_without_tokenizing(t)
for t in sentence] # type: List[List[int]]
logging.vlog(5, 'Sentence encoded into >>>%s<<<.', encoded_tokens)
flattened_encodings = sum(encoded_tokens, []) # type: List[int]
logging.vlog(5, 'Flattened into >>>%s<<<.', flattened_encodings)
decoded_tokens = subword_tokenizer.decode_list(
-        flattened_encodings)  # type: List[Text]
+        flattened_encodings)  # type: List[str]
logging.vlog(5, 'Sentence re-decoded into >>>%s<<<.', decoded_tokens)

subtokenized_sentences.append(decoded_tokens)
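
As a follow-up to the contract documented in `next_whole_token` above, here is a hedged sketch (not part of this commit) of how the function might be applied repeatedly to recover as many whole tokens as possible from a possibly truncated WordPiece sequence. The helper name is hypothetical, and the sketch assumes `from cubert import code_to_subtokenized_sentences`.

# Hypothetical helper built on next_whole_token; not part of this commit.
def whole_tokens_from_wordpieces(wordpiece_subtokens, tokenizer, subword_encoder):
  """Greedily recovers whole tokens until the remainder cannot be parsed."""
  whole_tokens = []
  remaining = list(wordpiece_subtokens)
  while remaining:
    try:
      token, end_index = code_to_subtokenized_sentences.next_whole_token(
          remaining, tokenizer, subword_encoder)
    except ValueError:
      break  # The leftover subtokens do not form a complete whole token.
    whole_tokens.append(token)
    remaining = remaining[end_index:]
  return whole_tokens
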
96 changes: 96 additions & 0 deletions cubert/code_to_subtokenized_sentences_test.py
@@ -0,0 +1,96 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for code_to_subtokenized_sentences."""
from typing import Sequence


from absl.testing import absltest
from absl.testing import parameterized
from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer


class CodeToSubtokenizedSentencesTest(parameterized.TestCase):

_CODE = 'def foo_bar(): return("ab, cd")'
_RECONSTITUTED_CODE = 'def foo_bar ():return ("ab, cd")\n'
_WORDPIECE_VOCABULARY = ('def_', 'foo\\u^_', 'bar_', '(_', ')_', ':_', 'ret',
'urn_', '"^_', 'ab^_', ',^_', ' ^_', 'cd^_', '"_',
'\\u\\u\\uNEWLINE\\u\\u\\u_', 'uv', 'xy', 'yz_',
'ww^_', 'www_')
_QUOTED_VOCABULARY = (f'"{w}"' for w in _WORDPIECE_VOCABULARY)
_VOCABULARY_CONTENT = '\n'.join(_QUOTED_VOCABULARY)
_WORDPIECE_SUBTOKENS = [
'def_', 'foo\\u^_', 'bar_', '(_', ')_', ':_', 'ret', 'urn_', '(_', '"^_',
'ab^_', ',^_', ' ^_', 'cd^_', '"_', ')_', '\\u\\u\\uNEWLINE\\u\\u\\u_'
]

def setUp(self):
super().setUp()
self.tokenizer = python_tokenizer.PythonTokenizer()
subword_vocabulary_path = self.create_tempfile(
content=self._VOCABULARY_CONTENT).full_path
self.subword_text_encoder = text_encoder.SubwordTextEncoder(
subword_vocabulary_path)

def test_code_to_sentences(self):
sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
self._CODE, self.tokenizer, self.subword_text_encoder)
self.assertEqual([self._WORDPIECE_SUBTOKENS], sentences)

def test_wordpiece_tokens_to_code(self):
code = code_to_subtokenized_sentences.wordpiece_subtokens_to_code(
self._WORDPIECE_SUBTOKENS, self.tokenizer, self.subword_text_encoder)
self.assertEqual(self._RECONSTITUTED_CODE, code)

@parameterized.named_parameters(
('complete_wordpiece_only', ('uv', 'yz_'), 'uvyz', 2),
('complete_wordpiece_prefix', ('uv', 'yz_', 'yz_'), 'uvyz', 2),
('complete_wordpiece_before_incomplete', ('uv', 'yz_', 'xy'), 'uvyz', 2),
('complete_cubert_only', ('uv', 'ww^_', 'yz_'), 'uvwwyz', 3),
('complete_cubert_prefix', ('uv', 'ww^_', 'yz_', 'yz_'), 'uvwwyz', 3),
('complete_cubert_before_incomplete',
('uv', 'ww^_', 'yz_', 'xy'), 'uvwwyz', 3),
('complete_cubert_before_incomplete_cubert',
('uv', 'ww^_', 'yz_', 'ww^_'), 'uvwwyz', 3),
# Here SubwordTextEncoder accepts a partial WordPiece subtoken, and we
# interpret it as best we can.
('complete_cubert_subtoken_before_incomplete_wordpiece',
('uv', 'ww^_', 'xy'), 'uvwwxy', 3),
# The SubwordTextEncoder accepts a partial WordPiece subtoken.
('incomplete_wordpiece_only', ('uv',), 'uv', 1),
)
def test_next_whole_token(self, subtokens,
expected_whole_token,
expected_end_index):
actual_whole_result = code_to_subtokenized_sentences.next_whole_token(
subtokens, self.tokenizer, self.subword_text_encoder)
self.assertEqual((expected_whole_token, expected_end_index),
actual_whole_result)

@parameterized.named_parameters(
('incomplete_cubert_only', ('uv', 'ww^_')),
)
def test_next_whole_token_fails(self, subtokens):
with self.assertRaises(ValueError):
_ = code_to_subtokenized_sentences.next_whole_token(
subtokens, self.tokenizer, self.subword_text_encoder)


if __name__ == '__main__':
absltest.main()
8 changes: 6 additions & 2 deletions cubert/cubert_tokenizer.py
@@ -176,8 +176,8 @@ def tokenize(self, source_code):
subtokens = unified_tokenizer.flatten_subtoken_lists(multi_tokens)
return subtokens

-  def untokenize(self, token_list):
-    """Untokenizes via `untokenize_abstract`."""
+  def untokenize_agnostic(self, token_list):
+    """Turns CuBERT subtokens into whole tokens."""
# Untokenize agnostic.
if (not token_list or token_list[-1] != unified_tokenizer.quote_special(
unified_tokenizer.TokenKind.EOS.name)):
@@ -190,7 +190,11 @@ def untokenize(self, token_list):
token_list,
sanitization_mapping=self.mappings,
sentinel=unified_tokenizer.SENTINEL)
return whole_tokens

def untokenize(self, token_list):
"""Untokenizes via `untokenize_abstract`."""
whole_tokens = self.untokenize_agnostic(token_list)
return self.untokenize_abstract(whole_tokens)
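
For clarity, a hedged sketch of the two-stage relationship established by this refactor, assuming `tokenizer` is a concrete CuBERT tokenizer instance and `subtokens` is a CuBERT subtoken list ending in the EOS token:

# Stage 1 (new public method): CuBERT subtokens -> language-agnostic whole tokens.
whole_tokens = tokenizer.untokenize_agnostic(subtokens)
# Stage 2: whole tokens -> source text via the language-specific untokenize_abstract,
# so tokenizer.untokenize(subtokens) is equivalent to:
source = tokenizer.untokenize_abstract(whole_tokens)
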


