Removes default specialization from the Python tokenizer.
Previously, the Python tokenizer was throwing away comment tokens by default. This change removes that default, but still ensures that old tests (which assumed that default) pass.

PiperOrigin-RevId: 351193054
maniatis authored and copybara-github committed Jan 11, 2021
1 parent 0f6d747 commit 74c8e25
Showing 3 changed files with 7 additions and 18 deletions.
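
To make the behavioral change concrete, here is a minimal usage sketch (not part of the diff; the import paths assume the cubert package layout shown below):

from cubert import python_tokenizer
from cubert import unified_tokenizer

# After this change, PythonTokenizer no longer drops COMMENT tokens by default.
tokenizer = python_tokenizer.PythonTokenizer()

# Callers who relied on the old default must now opt in to comment skipping
# explicitly.
tokenizer.update_types_to_skip([unified_tokenizer.TokenKind.COMMENT])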
9 changes: 5 additions & 4 deletions cubert/cubert_tokenizer.py
@@ -17,9 +17,9 @@
 
 import abc
 import tokenize
+from typing import Collection
 from typing import Dict
 from typing import Iterable
-from typing import List
 from typing import Mapping
 from typing import Sequence
 from typing import Text
@@ -42,7 +42,7 @@ class CuBertTokenizer(abc.ABC):
 
   def __init__(self, max_output_token_length = MAX_OUTPUT_TOKEN_LENGTH,
                reserved = ()):
-    self.types_to_skip = []
+    self.types_to_skip = ()
     self.reserved = reserved
     self.mappings: Dict[str, str]
     self.update_mappings({
@@ -87,14 +87,15 @@ def untokenize_abstract(self, whole_tokens):
     """
 
   def update_types_to_skip(
-      self, types_to_skip):
+      self, types_to_skip
+  ):
     """Replaces the set of token types that are ignored.
 
     Each tokenizer may provide different semantics with respect to this list,
     and may ignore it altogether.
 
     Args:
-      types_to_skip: List of types (from the constants in the `token` module) or
+      types_to_skip: Types (from the constants in the `token` module) or
        `unified_tokenizer.TokenKind`. Note that some of those constants are
        actually defined in the `tokenize` module.
     """
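
As the updated docstring notes, the skip collection may mix constants from the standard `token`/`tokenize` modules with `unified_tokenizer.TokenKind` values; a hypothetical illustration (not part of this change, import paths assumed as above):

import tokenize

from cubert import python_tokenizer
from cubert import unified_tokenizer

tokenizer = python_tokenizer.PythonTokenizer()

# Both forms are accepted; each concrete tokenizer decides how (or whether)
# to honor the requested skip set.
tokenizer.update_types_to_skip(
    (tokenize.COMMENT, unified_tokenizer.TokenKind.COMMENT))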
6 changes: 0 additions & 6 deletions cubert/python_tokenizer.py
@@ -70,12 +70,6 @@ class PythonTokenizer(cubert_tokenizer.CuBertTokenizer):
       tokenize.NEWLINE, tokenize.DEDENT, tokenize.NL
   ]
 
-  def __init__(self, *args, **kwargs):
-    super(PythonTokenizer, self).__init__(*args, **kwargs)
-
-    # By default, we drop COMMENT tokens.
-    self.update_types_to_skip([unified_tokenizer.TokenKind.COMMENT])
-
   def tokenize_and_abstract(
       self,
       source_code):
10 changes: 2 additions & 8 deletions cubert/python_tokenizer_test.py
@@ -173,15 +173,9 @@ def test_python_tokenize_abstract_returns_positioning(
          'a = b\n#comment\nc = d',
          ['a', '=', 'b', _NEWLINE_NAME, '___NL___', 'c', '=', 'd', _EOS_NAME],
      ),
-     (
-         'comments_are_skipped_by_default',
-         None,
-         'a = b\n#comment\nc = d',
-         ['a', '=', 'b', _NEWLINE_NAME, '___NL___', 'c', '=', 'd', _EOS_NAME],
-     ),
      (
          'comments_are_not_skipped_when_desired',
-         [],
+         (),
          '# Comment.',
          # This NL token is preserved as ___NL___ and not replaced by
          # ___NLCHAR___ because it's introduced by the Python lexer for
@@ -347,7 +341,7 @@ def test_python_tokenize_respects_mappings(self, mappings, source, expected):
          ' # My comment\n'
          ' pass',
          'def f ():\n'
-         '\n'
+         '# My comment\n'  # Preserved
          ' pass ',
      ),
      (
