Makes keywords size-splittable, to account for long reserved words and keywords.

This ensures that long reserved keywords, as well as programmatic keywords (e.g., Python's INDENT token spellings), are split into subtokens of reasonable length.

PiperOrigin-RevId: 392552460
maniatis authored and copybara-github committed Aug 24, 2021
1 parent f8cf4e0 commit 564c56d
Showing 2 changed files with 8 additions and 4 deletions.
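
As context for the diffs below, here is a minimal sketch of what making a token kind "size-splittable" means: a long spelling is chopped into consecutive chunks so that no single subtoken exceeds a configured maximum length. The helper name and the maximum length of 2 are illustrative stand-ins, not cubert's actual API.

def split_by_length(spelling, max_output_token_length):
  """Splits a spelling into consecutive chunks of bounded length (illustrative)."""
  return [
      spelling[start:start + max_output_token_length]
      for start in range(0, len(spelling), max_output_token_length)
  ]

# With a maximum subtoken length of 2, a synthetic indentation keyword such as
# '___INDENT___' becomes six two-character subtokens.
assert split_by_length('___INDENT___', 2) == ['__', '_I', 'ND', 'EN', 'T_', '__']
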
7 changes: 5 additions & 2 deletions cubert/unified_tokenizer.py
@@ -176,6 +176,7 @@ def fill_range_with_whitespace(start,
 TokenKind.IDENTIFIER,
 TokenKind.WHITESPACE,
 TokenKind.ERROR,
+TokenKind.KEYWORD,
 )

 _UPPERCASE = r'\p{Lu}'
@@ -598,8 +599,10 @@ def _shorten_subtokens(
 """Further subtokenizes any subtokens that are too long.

 At this point, we're done with all heuristic splitting. Now split what's left
-by length if need be. We don't do anything about keywords or other
-punctuation.
+by length if need be. We do allow keyword splitting by length, because in
+some cases (e.g., Python indentation), what we tag as a keyword can have
+arbitrary length. We don't split punctuation, but that might also be revised
+in the future.

 Args:
   token_lists: List of subtoken lists, of which only those of kinds
5 changes: 3 additions & 2 deletions cubert/unified_tokenizer_test.py
@@ -251,6 +251,7 @@ class SplitAgnosticTest(parameterized.TestCase):
 _IDENTIFIER = unified_tokenizer.TokenKind.IDENTIFIER
 _NUMBER = unified_tokenizer.TokenKind.NUMBER
 _STRING = unified_tokenizer.TokenKind.STRING
+_KEYWORD = unified_tokenizer.TokenKind.KEYWORD

 @parameterized.named_parameters(
 (
@@ -317,6 +318,7 @@ class SplitAgnosticTest(parameterized.TestCase):
 ('1234', _NUMBER),
 ('bb', _COMMENT),
 ('11', _NUMBER),
+('___INDENT___ ', _KEYWORD),
 ],
 2,
 [
@@ -329,20 +331,19 @@ class SplitAgnosticTest(parameterized.TestCase):
 (['12', '34'], _NUMBER),
 (['bb'], _COMMENT),
 (['11'], _NUMBER),
+(['__', '_I', 'ND', 'EN', 'T_', '__', ' ', ' '], _KEYWORD),
 ],
 ),
 (
 'language_strings_unaffected',
 [
 ('punctuation', unified_tokenizer.TokenKind.PUNCTUATION),
-('keyworddddd', unified_tokenizer.TokenKind.KEYWORD),
 ('newlineeeee', unified_tokenizer.TokenKind.NEWLINE),
 ('eosssssssss', unified_tokenizer.TokenKind.EOS),
 ],
 2,
 [
 (['punctuation'], unified_tokenizer.TokenKind.PUNCTUATION),
-(['keyworddddd'], unified_tokenizer.TokenKind.KEYWORD),
 (['newlineeeee'], unified_tokenizer.TokenKind.NEWLINE),
 (['eosssssssss'], unified_tokenizer.TokenKind.EOS),
 ],
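
The docstring change and the updated tests above spell out the new policy: subtokens of most kinds, now including KEYWORD, are split further by length, while PUNCTUATION (like the other language strings NEWLINE and EOS) is left whole. Below is a rough sketch of that gating, using a trimmed-down stand-in for TokenKind and a hypothetical _SHORTENABLE_KINDS constant rather than the real names in unified_tokenizer.py.

import enum


class TokenKind(enum.Enum):
  # Trimmed-down stand-in for unified_tokenizer.TokenKind.
  IDENTIFIER = enum.auto()
  NUMBER = enum.auto()
  WHITESPACE = enum.auto()
  KEYWORD = enum.auto()
  PUNCTUATION = enum.auto()


# Hypothetical set of kinds whose subtokens may be shortened by length; after
# this commit, KEYWORD belongs in such a set, while PUNCTUATION still does not.
_SHORTENABLE_KINDS = frozenset((
    TokenKind.IDENTIFIER,
    TokenKind.NUMBER,
    TokenKind.WHITESPACE,
    TokenKind.KEYWORD,
))


def shorten_subtokens(subtokens, kind, max_output_token_length):
  """Length-splits subtokens of shortenable kinds; leaves the rest untouched."""
  if kind not in _SHORTENABLE_KINDS:
    return list(subtokens)
  shortened = []
  for subtoken in subtokens:
    shortened.extend(
        subtoken[start:start + max_output_token_length]
        for start in range(0, len(subtoken), max_output_token_length))
  return shortened


# Keywords are now size-split...
assert shorten_subtokens(['___INDENT___'], TokenKind.KEYWORD, 2) == [
    '__', '_I', 'ND', 'EN', 'T_', '__']
# ...while punctuation stays whole, mirroring the 'language_strings_unaffected'
# test case.
assert shorten_subtokens(['punctuation'], TokenKind.PUNCTUATION, 2) == [
    'punctuation']

If the tuple extended in the first hunk of unified_tokenizer.py plays this gating role, then enabling keyword shortening is exactly this kind of one-line change.
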
