Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve non ascii checker #5643

Merged
merged 17 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -595,3 +595,6 @@ contributors:
* Eero Vuojolahti: contributor

* Kian-Meng, Ang: contributor

* Carli* Freudenberg (CarliJoy): contributor
- Improve non-ascii-name checker
9 changes: 9 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ Release date: TBA

Closes #5588

* Rewrote checker for ``non-ascii-name``.
It now ensures __all__ Python names are ASCII and also properly
checks the names of imports (``non-ascii-module-import``) as
well as file names (``non-ascii-file-name``) and emits their respective new warnings.

Non ASCII characters could be homoglyphs (look alike characters) and hard to
enter on a non specialized keyboard.
See `Confusable Characters in PEP 672 <https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers>`_

* When run in parallel mode ``pylint`` now pickles the data passed to subprocesses with
the ``dill`` package. The ``dill`` package has therefore been added as a dependency.

Expand Down
9 changes: 9 additions & 0 deletions doc/whatsnew/2.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ New checkers

Closes #5460

* Rewrote Checker of ``non-ascii-name``.
It now ensures __all__ Python names are ASCII and also properly
checks the names of imports (``non-ascii-module-import``) as
well as file names (``non-ascii-file-name``) and emits their respective new warnings.

Non ASCII characters could be homoglyphs (look alike characters) and hard to
enter on a non specialized keyboard.
See `Confusable Characters in PEP 672 <https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers>`_

Removed checkers
================

Expand Down
9 changes: 8 additions & 1 deletion pylint/checkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,14 @@
15: stdlib
16: python3
17: refactoring
18-50: not yet used: reserved for future internal checkers.
.
.
.
24: non-ascii-names
25-50: not yet used: reserved for future internal checkers.
This file is not updated. Use
script/get_unused_message_id_category.py
to get the next free checker id.
51-99: perhaps used: reserved for external checkers

The raw_metrics checker has no number associated since it doesn't emit any
Expand Down
27 changes: 5 additions & 22 deletions pylint/checkers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1737,11 +1737,6 @@ class NameChecker(_BasicChecker):
]
},
),
"C0144": (
'%s name "%s" contains a non-ASCII unicode character',
"non-ascii-name",
"Used when the name contains at least one non-ASCII unicode character.",
),
"W0111": (
"Name %s will become a keyword in Python %s",
"assign-to-new-keyword",
Expand Down Expand Up @@ -1838,7 +1833,6 @@ def __init__(self, linter):
self._name_hints = {}
self._good_names_rgxs_compiled = []
self._bad_names_rgxs_compiled = []
self._non_ascii_rgx_compiled = re.compile("[^\u0000-\u007F]")

def open(self):
self.linter.stats.reset_bad_names()
Expand Down Expand Up @@ -1878,7 +1872,7 @@ def _create_naming_rules(self):

return regexps, hints

@utils.check_messages("disallowed-name", "invalid-name", "non-ascii-name")
@utils.check_messages("disallowed-name", "invalid-name")
def visit_module(self, node: nodes.Module) -> None:
self._check_name("module", node.name.split(".")[-1], node)
self._bad_names = {}
Expand All @@ -1904,19 +1898,15 @@ def leave_module(self, _: nodes.Module) -> None:
for args in warnings:
self._raise_name_warning(prevalent_group, *args)

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_classdef(self, node: nodes.ClassDef) -> None:
self._check_assign_to_new_keyword_violation(node.name, node)
self._check_name("class", node.name, node)
for attr, anodes in node.instance_attrs.items():
if not any(node.instance_attr_ancestors(attr)):
self._check_name("attr", attr, anodes[0])

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_functiondef(self, node: nodes.FunctionDef) -> None:
# Do not emit any warnings if the method is just an implementation
# of a base class method.
Expand Down Expand Up @@ -1944,14 +1934,12 @@ def visit_functiondef(self, node: nodes.FunctionDef) -> None:

visit_asyncfunctiondef = visit_functiondef

@utils.check_messages("disallowed-name", "invalid-name", "non-ascii-name")
@utils.check_messages("disallowed-name", "invalid-name")
def visit_global(self, node: nodes.Global) -> None:
for name in node.names:
self._check_name("const", name, node)

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_assignname(self, node: nodes.AssignName) -> None:
"""check module level assigned names"""
self._check_assign_to_new_keyword_violation(node.name, node)
Expand Down Expand Up @@ -2041,11 +2029,6 @@ def _name_disallowed_by_regex(self, name: str) -> bool:

def _check_name(self, node_type, name, node, confidence=interfaces.HIGH):
"""check for a name using the type's regexp"""
non_ascii_match = self._non_ascii_rgx_compiled.match(name)
if non_ascii_match is not None:
self._raise_name_warning(
None, node, node_type, name, confidence, warning="non-ascii-name"
)

def _should_exempt_from_invalid_name(node):
if node_type == "variable":
Expand Down
201 changes: 201 additions & 0 deletions pylint/checkers/non_ascii_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# Copyright (c) 2021-2022 Carli Freudenberg <[email protected]>

# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
# For details: https://github.com/PyCQA/pylint/blob/main/LICENSE
"""All alphanumeric unicode character are allowed in Python but due
to similarities in how they look they can be confused.

See: https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers

The following checkers are intended to make users are aware of these issues.
"""

import sys
from typing import Optional, Union

from astroid import nodes

from pylint import constants, interfaces, lint
from pylint.checkers import base_checker, utils

if sys.version_info[:2] >= (3, 7):
# pylint: disable-next=fixme
# TODO: Remove after 3.6 has been deprecated
Py37Str = str
else:

class Py37Str(str):
# Allow Python 3.6 compatibility
def isascii(self: str) -> bool:
return all("\u0000" <= x <= "\u007F" for x in self)


NON_ASCII_HELP = (
"Used when the name contains at least one non-ASCII unicode character. "
"See https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers"
" for a background why this could be bad. \n"
"If your programming guideline defines that you are programming in "
"English, then there should be no need for non ASCII characters in "
"Python Names. If not you can simply disable this check."
)


class NonAsciiNameChecker(base_checker.BaseChecker):
"""A strict name checker only allowing ASCII

Note: This check only checks Names, so it ignores the content of
docstrings and comments!
"""

__implements__ = interfaces.IAstroidChecker
priority = -1

msgs = {
"C2401": (
'%s name "%s" contains a non-ASCII character, consider renaming it.',
"non-ascii-name",
NON_ASCII_HELP,
{"old_names": [("C0144", "old-non-ascii-name")]},
),
# First %s will always be "file"
"W2402": (
(
'%s name "%s" contains a non-ASCII character. PEP 3131 only allows '
"non-ascii identifiers, not file names."
),
"non-ascii-file-name",
(
# Some = PyCharm at the time of writing didn't display the non_ascii_name_loł
# files and had big troubles with git.
# Probably only a bug shows the problem quite good.
# That's also why this is a warning and not only a convention!
"Some editors don't support non-ASCII file names properly. "
"Even though Python supports UTF-8 files since Python 3.5 this isn't "
"recommended for interoperability. Further reading:\n"
"- https://www.python.org/dev/peps/pep-0489/#export-hook-name\n"
"- https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers\n"
"- https://bugs.python.org/issue20485\n"
),
),
# First %s will always be "module"
"C2403": (
'%s name "%s" contains a non-ASCII character, use an ASCII-only alias for import.',
"non-ascii-module-import",
NON_ASCII_HELP,
),
}

name = "NonASCII-Checker"

def _check_name(
self, node_type: str, name: Optional[str], node: nodes.NodeNG
) -> None:
"""Check whether a name is using non-ASCII characters."""

if name is None:
# For some nodes i.e. *kwargs from a dict, the name will be empty
return

if not (Py37Str(name).isascii()):
type_label = constants.HUMAN_READABLE_TYPES[node_type]
args = (type_label.capitalize(), name)

msg = "non-ascii-name"

# Some node types have customized messages
if node_type == "file":
msg = "non-ascii-file-name"
elif node_type == "module":
msg = "non-ascii-module-import"

self.add_message(msg, node=node, args=args, confidence=interfaces.HIGH)

@utils.check_messages("non-ascii-name")
def visit_module(self, node: nodes.Module) -> None:
self._check_name("file", node.name.split(".")[-1], node)

@utils.check_messages("non-ascii-name")
def visit_functiondef(
self, node: Union[nodes.FunctionDef, nodes.AsyncFunctionDef]
) -> None:
self._check_name("function", node.name, node)

# Check argument names
arguments = node.args

# Check position only arguments
if arguments.posonlyargs:
for pos_only_arg in arguments.posonlyargs:
self._check_name("argument", pos_only_arg.name, pos_only_arg)

# Check "normal" arguments
if arguments.args:
for arg in arguments.args:
self._check_name("argument", arg.name, arg)

# Check key word only arguments
if arguments.kwonlyargs:
for kwarg in arguments.kwonlyargs:
self._check_name("argument", kwarg.name, kwarg)

visit_asyncfunctiondef = visit_functiondef

@utils.check_messages("non-ascii-name")
def visit_global(self, node: nodes.Global) -> None:
for name in node.names:
self._check_name("const", name, node)

@utils.check_messages("non-ascii-name")
def visit_assignname(self, node: nodes.AssignName) -> None:
"""check module level assigned names"""
# The NameChecker from which this Checker originates knows a lot of different
# versions of variables, i.e. constants, inline variables etc.
# To simplify we use only `variable` here, as we don't need to apply different
# rules to different types of variables.
frame = node.frame()

if isinstance(frame, nodes.FunctionDef):
if node.parent in frame.body:
# Only perform the check if the assignment was done in within the body
# of the function (and not the function parameter definition
# (will be handled in visit_functiondef)
# or within a decorator (handled in visit_call)
self._check_name("variable", node.name, node)
elif isinstance(frame, nodes.ClassDef):
self._check_name("attr", node.name, node)
else:
# Possibilities here:
# - isinstance(node.assign_type(), nodes.Comprehension) == inlinevar
# - isinstance(frame, nodes.Module) == variable (constant?)
# - some other kind of assigment missed but still most likely a variable
self._check_name("variable", node.name, node)

@utils.check_messages("non-ascii-name")
def visit_classdef(self, node: nodes.ClassDef) -> None:
self._check_name("class", node.name, node)
for attr, anodes in node.instance_attrs.items():
if not any(node.instance_attr_ancestors(attr)):
self._check_name("attr", attr, anodes[0])

def _check_module_import(self, node: Union[nodes.ImportFrom, nodes.Import]) -> None:
for module_name, alias in node.names:
name = alias or module_name
self._check_name("module", name, node)

@utils.check_messages("non-ascii-name")
def visit_import(self, node: nodes.Import) -> None:
self._check_module_import(node)

@utils.check_messages("non-ascii-name")
def visit_importfrom(self, node: nodes.ImportFrom) -> None:
self._check_module_import(node)

@utils.check_messages("non-ascii-name")
def visit_call(self, node: nodes.Call) -> None:
"""Check if the used keyword args are correct."""
for keyword in node.keywords:
self._check_name("argument", keyword.arg, keyword)


def register(linter: lint.PyLinter) -> None:
linter.register_checker(NonAsciiNameChecker(linter))
1 change: 1 addition & 0 deletions pylint/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class WarningScope:
Python {sys.version}"""

HUMAN_READABLE_TYPES = {
"file": "file",
"module": "module",
"const": "constant",
"class": "class",
Expand Down
15 changes: 13 additions & 2 deletions pylint/testutils/checker_test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,18 @@ def assertNoMessages(self):
yield

@contextlib.contextmanager
def assertAddsMessages(self, *messages: MessageTest) -> Generator[None, None, None]:
def assertAddsMessages(
self, *messages: MessageTest, ignore_position: bool = False
) -> Generator[None, None, None]:
"""Assert that exactly the given method adds the given messages.

The list of messages must exactly match *all* the messages added by the
method. Additionally, we check to see whether the args in each message can
actually be substituted into the message string.

Using the keyword argument `ignore_position`, all checks for position
arguments (line, col_offset, ...) will be skipped. This can be used to
just test messages for the correct node.
"""
yield
got = self.linter.release_messages()
Expand All @@ -53,10 +59,15 @@ def assertAddsMessages(self, *messages: MessageTest) -> Generator[None, None, No

for expected_msg, gotten_msg in zip(messages, got):
assert expected_msg.msg_id == gotten_msg.msg_id, msg
assert expected_msg.line == gotten_msg.line, msg
assert expected_msg.node == gotten_msg.node, msg
assert expected_msg.args == gotten_msg.args, msg
assert expected_msg.confidence == gotten_msg.confidence, msg

if ignore_position:
# Do not check for line, col_offset etc...
continue

assert expected_msg.line == gotten_msg.line, msg
assert expected_msg.col_offset == gotten_msg.col_offset, msg
if PY38_PLUS:
# pylint: disable=fixme
Expand Down
Loading