Removes default specialization from the Python tokenizer.
Previously, the Python tokenizer was throwing away comment tokens by default. This change removes that default, but still ensures that old tests (which assumed that default) pass.

PiperOrigin-RevId: 351193054
maniatis authored and copybara-github committed Jan 11, 2021
1 parent 0f6d747 commit 74c8e25
Showing 3 changed files with 7 additions and 18 deletions.
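
To make the behavioral change concrete, here is a minimal usage sketch (not part of the diff; the import paths assume the cubert package layout shown below):

from cubert import python_tokenizer
from cubert import unified_tokenizer

# After this change, PythonTokenizer no longer drops COMMENT tokens by default.
tokenizer = python_tokenizer.PythonTokenizer()

# Callers who relied on the old default must now opt in to comment skipping
# explicitly.
tokenizer.update_types_to_skip([unified_tokenizer.TokenKind.COMMENT])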
9 changes: 5 additions & 4 deletions cubert/cubert_tokenizer.py
@@ -17,9 +17,9 @@
 
 import abc
 import tokenize
+from typing import Collection
 from typing import Dict
 from typing import Iterable
-from typing import List
 from typing import Mapping
 from typing import Sequence
 from typing import Text
@@ -42,7 +42,7 @@ class CuBertTokenizer(abc.ABC):
 
   def __init__(self, max_output_token_length = MAX_OUTPUT_TOKEN_LENGTH,
                reserved = ()):
-    self.types_to_skip = []
+    self.types_to_skip = ()
     self.reserved = reserved
     self.mappings: Dict[str, str]
     self.update_mappings({
@@ -87,14 +87,15 @@ def untokenize_abstract(self, whole_tokens):
     """
 
   def update_types_to_skip(
-      self, types_to_skip):
+      self, types_to_skip
+  ):
     """Replaces the set of token types that are ignored.
 
     Each tokenizer may provide different semantics with respect to this list,
     and may ignore it altogether.
 
     Args:
-      types_to_skip: List of types (from the constants in the `token` module) or
+      types_to_skip: Types (from the constants in the `token` module) or
        `unified_tokenizer.TokenKind`. Note that some of those constants are
        actually defined in the `tokenize` module.
     """
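
As the updated docstring notes, the skip collection may mix constants from the standard `token`/`tokenize` modules with `unified_tokenizer.TokenKind` values; a hypothetical illustration (not part of this change, import paths assumed as above):

import tokenize

from cubert import python_tokenizer
from cubert import unified_tokenizer

tokenizer = python_tokenizer.PythonTokenizer()

# Both forms are accepted; each concrete tokenizer decides how (or whether)
# to honor the requested skip set.
tokenizer.update_types_to_skip(
    (tokenize.COMMENT, unified_tokenizer.TokenKind.COMMENT))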
6 changes: 0 additions & 6 deletions cubert/python_tokenizer.py
@@ -70,12 +70,6 @@ class PythonTokenizer(cubert_tokenizer.CuBertTokenizer):
       tokenize.NEWLINE, tokenize.DEDENT, tokenize.NL
   ]
 
-  def __init__(self, *args, **kwargs):
-    super(PythonTokenizer, self).__init__(*args, **kwargs)
-
-    # By default, we drop COMMENT tokens.
-    self.update_types_to_skip([unified_tokenizer.TokenKind.COMMENT])
-
   def tokenize_and_abstract(
       self,
       source_code):
10 changes: 2 additions & 8 deletions cubert/python_tokenizer_test.py
@@ -173,15 +173,9 @@ def test_python_tokenize_abstract_returns_positioning(
          'a = b\n#comment\nc = d',
          ['a', '=', 'b', _NEWLINE_NAME, '___NL___', 'c', '=', 'd', _EOS_NAME],
      ),
-     (
-         'comments_are_skipped_by_default',
-         None,
-         'a = b\n#comment\nc = d',
-         ['a', '=', 'b', _NEWLINE_NAME, '___NL___', 'c', '=', 'd', _EOS_NAME],
-     ),
      (
          'comments_are_not_skipped_when_desired',
-         [],
+         (),
          '# Comment.',
          # This NL token is preserved as ___NL___ and not replaced by
          # ___NLCHAR___ because it's introduced by the Python lexer for
@@ -347,7 +341,7 @@ def test_python_tokenize_respects_mappings(self, mappings, source, expected):
          ' # My comment\n'
          ' pass',
          'def f ():\n'
-         '\n'
+         '# My comment\n'  # Preserved
          ' pass ',
      ),
      (
