Makes keywords size-splittable, to account for long reserved words and keywords.

This ensures that long reserved keywords, as well as programmatic keywords (e.g., Python's INDENT token spellings), are split into subtokens of reasonable length.

PiperOrigin-RevId: 392552460
maniatis authored and copybara-github committed Aug 24, 2021
1 parent f8cf4e0 commit 564c56d
Showing 2 changed files with 8 additions and 4 deletions.
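
As context for the diffs below, here is a minimal sketch of what making a token kind "size-splittable" means: a long spelling is chopped into consecutive chunks so that no single subtoken exceeds a configured maximum length. The helper name and the maximum length of 2 are illustrative stand-ins, not cubert's actual API.

def split_by_length(spelling, max_output_token_length):
  """Splits a spelling into consecutive chunks of bounded length (illustrative)."""
  return [
      spelling[start:start + max_output_token_length]
      for start in range(0, len(spelling), max_output_token_length)
  ]

# With a maximum subtoken length of 2, a synthetic indentation keyword such as
# '___INDENT___' becomes six two-character subtokens.
assert split_by_length('___INDENT___', 2) == ['__', '_I', 'ND', 'EN', 'T_', '__']
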
7 changes: 5 additions & 2 deletions cubert/unified_tokenizer.py
@@ -176,6 +176,7 @@ def fill_range_with_whitespace(start,
 TokenKind.IDENTIFIER,
 TokenKind.WHITESPACE,
 TokenKind.ERROR,
+TokenKind.KEYWORD,
 )

 _UPPERCASE = r'\p{Lu}'
@@ -598,8 +599,10 @@ def _shorten_subtokens(
 """Further subtokenizes any subtokens that are too long.

 At this point, we're done with all heuristic splitting. Now split what's left
-by length if need be. We don't do anything about keywords or other
-punctuation.
+by length if need be. We do allow keyword splitting by length, because in
+some cases (e.g., Python indentation), what we tag as a keyword can have
+arbitrary length. We don't split punctuation, but that might also be revised
+in the future.

 Args:
   token_lists: List of subtoken lists, of which only those of kinds
5 changes: 3 additions & 2 deletions cubert/unified_tokenizer_test.py
@@ -251,6 +251,7 @@ class SplitAgnosticTest(parameterized.TestCase):
 _IDENTIFIER = unified_tokenizer.TokenKind.IDENTIFIER
 _NUMBER = unified_tokenizer.TokenKind.NUMBER
 _STRING = unified_tokenizer.TokenKind.STRING
+_KEYWORD = unified_tokenizer.TokenKind.KEYWORD

 @parameterized.named_parameters(
 (
@@ -317,6 +318,7 @@ class SplitAgnosticTest(parameterized.TestCase):
 ('1234', _NUMBER),
 ('bb', _COMMENT),
 ('11', _NUMBER),
+('___INDENT___ ', _KEYWORD),
 ],
 2,
 [
@@ -329,20 +331,19 @@ class SplitAgnosticTest(parameterized.TestCase):
 (['12', '34'], _NUMBER),
 (['bb'], _COMMENT),
 (['11'], _NUMBER),
+(['__', '_I', 'ND', 'EN', 'T_', '__', ' ', ' '], _KEYWORD),
 ],
 ),
 (
 'language_strings_unaffected',
 [
 ('punctuation', unified_tokenizer.TokenKind.PUNCTUATION),
-('keyworddddd', unified_tokenizer.TokenKind.KEYWORD),
 ('newlineeeee', unified_tokenizer.TokenKind.NEWLINE),
 ('eosssssssss', unified_tokenizer.TokenKind.EOS),
 ],
 2,
 [
 (['punctuation'], unified_tokenizer.TokenKind.PUNCTUATION),
-(['keyworddddd'], unified_tokenizer.TokenKind.KEYWORD),
 (['newlineeeee'], unified_tokenizer.TokenKind.NEWLINE),
 (['eosssssssss'], unified_tokenizer.TokenKind.EOS),
 ],
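
The docstring change and the updated tests above spell out the new policy: subtokens of most kinds, now including KEYWORD, are split further by length, while PUNCTUATION (like the other language strings NEWLINE and EOS) is left whole. Below is a rough sketch of that gating, using a trimmed-down stand-in for TokenKind and a hypothetical _SHORTENABLE_KINDS constant rather than the real names in unified_tokenizer.py.

import enum


class TokenKind(enum.Enum):
  # Trimmed-down stand-in for unified_tokenizer.TokenKind.
  IDENTIFIER = enum.auto()
  NUMBER = enum.auto()
  WHITESPACE = enum.auto()
  KEYWORD = enum.auto()
  PUNCTUATION = enum.auto()


# Hypothetical set of kinds whose subtokens may be shortened by length; after
# this commit, KEYWORD belongs in such a set, while PUNCTUATION still does not.
_SHORTENABLE_KINDS = frozenset((
    TokenKind.IDENTIFIER,
    TokenKind.NUMBER,
    TokenKind.WHITESPACE,
    TokenKind.KEYWORD,
))


def shorten_subtokens(subtokens, kind, max_output_token_length):
  """Length-splits subtokens of shortenable kinds; leaves the rest untouched."""
  if kind not in _SHORTENABLE_KINDS:
    return list(subtokens)
  shortened = []
  for subtoken in subtokens:
    shortened.extend(
        subtoken[start:start + max_output_token_length]
        for start in range(0, len(subtoken), max_output_token_length))
  return shortened


# Keywords are now size-split...
assert shorten_subtokens(['___INDENT___'], TokenKind.KEYWORD, 2) == [
    '__', '_I', 'ND', 'EN', 'T_', '__']
# ...while punctuation stays whole, mirroring the 'language_strings_unaffected'
# test case.
assert shorten_subtokens(['punctuation'], TokenKind.PUNCTUATION, 2) == [
    'punctuation']

If the tuple extended in the first hunk of unified_tokenizer.py plays this gating role, then enabling keyword shortening is exactly this kind of one-line change.
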
