Skip to content

Commit

Permalink
Improve non ascii checker (#5643)
Browse files Browse the repository at this point in the history
* split ``non-ascii-name`` into 3 different msgs

- non-ascii-identifier (replaces non-ascii-name)
- non-ascii-file-name (a warning)
- non-ascii-module-import (only considering the namespace the import is imported in)

Co-authored-by: Daniël van Noord <[email protected]>
Co-authored-by: Pierre Sassoulas <[email protected]>
  • Loading branch information
3 people authored Jan 10, 2022
1 parent 83a4b8b commit d2475b4
Show file tree
Hide file tree
Showing 68 changed files with 945 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -595,3 +595,6 @@ contributors:
* Eero Vuojolahti: contributor

* Kian-Meng, Ang: contributor

* Carli* Freudenberg (CarliJoy): contributor
- Improve non-ascii-name checker
9 changes: 9 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ Release date: TBA

Closes #5588

* Rewrote checker for ``non-ascii-name``.
It now ensures __all__ Python names are ASCII and also properly
checks the names of imports (``non-ascii-module-import``) as
well as file names (``non-ascii-file-name``) and emits their respective new warnings.

Non ASCII characters could be homoglyphs (look alike characters) and hard to
enter on a non specialized keyboard.
See `Confusable Characters in PEP 672 <https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers>`_

* When run in parallel mode ``pylint`` now pickles the data passed to subprocesses with
the ``dill`` package. The ``dill`` package has therefore been added as a dependency.

Expand Down
9 changes: 9 additions & 0 deletions doc/whatsnew/2.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ New checkers

Closes #5460

* Rewrote Checker of ``non-ascii-name``.
It now ensures __all__ Python names are ASCII and also properly
checks the names of imports (``non-ascii-module-import``) as
well as file names (``non-ascii-file-name``) and emits their respective new warnings.

Non ASCII characters could be homoglyphs (look alike characters) and hard to
enter on a non specialized keyboard.
See `Confusable Characters in PEP 672 <https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers>`_

Removed checkers
================

Expand Down
9 changes: 8 additions & 1 deletion pylint/checkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,14 @@
15: stdlib
16: python3
17: refactoring
18-50: not yet used: reserved for future internal checkers.
.
.
.
24: non-ascii-names
25-50: not yet used: reserved for future internal checkers.
This file is not updated. Use
script/get_unused_message_id_category.py
to get the next free checker id.
51-99: perhaps used: reserved for external checkers
The raw_metrics checker has no number associated since it doesn't emit any
Expand Down
27 changes: 5 additions & 22 deletions pylint/checkers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1737,11 +1737,6 @@ class NameChecker(_BasicChecker):
]
},
),
"C0144": (
'%s name "%s" contains a non-ASCII unicode character',
"non-ascii-name",
"Used when the name contains at least one non-ASCII unicode character.",
),
"W0111": (
"Name %s will become a keyword in Python %s",
"assign-to-new-keyword",
Expand Down Expand Up @@ -1838,7 +1833,6 @@ def __init__(self, linter):
self._name_hints = {}
self._good_names_rgxs_compiled = []
self._bad_names_rgxs_compiled = []
self._non_ascii_rgx_compiled = re.compile("[^\u0000-\u007F]")

def open(self):
self.linter.stats.reset_bad_names()
Expand Down Expand Up @@ -1878,7 +1872,7 @@ def _create_naming_rules(self):

return regexps, hints

@utils.check_messages("disallowed-name", "invalid-name", "non-ascii-name")
@utils.check_messages("disallowed-name", "invalid-name")
def visit_module(self, node: nodes.Module) -> None:
self._check_name("module", node.name.split(".")[-1], node)
self._bad_names = {}
Expand All @@ -1904,19 +1898,15 @@ def leave_module(self, _: nodes.Module) -> None:
for args in warnings:
self._raise_name_warning(prevalent_group, *args)

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_classdef(self, node: nodes.ClassDef) -> None:
self._check_assign_to_new_keyword_violation(node.name, node)
self._check_name("class", node.name, node)
for attr, anodes in node.instance_attrs.items():
if not any(node.instance_attr_ancestors(attr)):
self._check_name("attr", attr, anodes[0])

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_functiondef(self, node: nodes.FunctionDef) -> None:
# Do not emit any warnings if the method is just an implementation
# of a base class method.
Expand Down Expand Up @@ -1944,14 +1934,12 @@ def visit_functiondef(self, node: nodes.FunctionDef) -> None:

visit_asyncfunctiondef = visit_functiondef

@utils.check_messages("disallowed-name", "invalid-name", "non-ascii-name")
@utils.check_messages("disallowed-name", "invalid-name")
def visit_global(self, node: nodes.Global) -> None:
for name in node.names:
self._check_name("const", name, node)

@utils.check_messages(
"disallowed-name", "invalid-name", "assign-to-new-keyword", "non-ascii-name"
)
@utils.check_messages("disallowed-name", "invalid-name", "assign-to-new-keyword")
def visit_assignname(self, node: nodes.AssignName) -> None:
"""check module level assigned names"""
self._check_assign_to_new_keyword_violation(node.name, node)
Expand Down Expand Up @@ -2041,11 +2029,6 @@ def _name_disallowed_by_regex(self, name: str) -> bool:

def _check_name(self, node_type, name, node, confidence=interfaces.HIGH):
"""check for a name using the type's regexp"""
non_ascii_match = self._non_ascii_rgx_compiled.match(name)
if non_ascii_match is not None:
self._raise_name_warning(
None, node, node_type, name, confidence, warning="non-ascii-name"
)

def _should_exempt_from_invalid_name(node):
if node_type == "variable":
Expand Down
201 changes: 201 additions & 0 deletions pylint/checkers/non_ascii_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# Copyright (c) 2021-2022 Carli Freudenberg <[email protected]>

# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
# For details: https://github.com/PyCQA/pylint/blob/main/LICENSE
"""All alphanumeric unicode character are allowed in Python but due
to similarities in how they look they can be confused.
See: https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers
The following checkers are intended to make users are aware of these issues.
"""

import sys
from typing import Optional, Union

from astroid import nodes

from pylint import constants, interfaces, lint
from pylint.checkers import base_checker, utils

if sys.version_info[:2] >= (3, 7):
# pylint: disable-next=fixme
# TODO: Remove after 3.6 has been deprecated
Py37Str = str
else:

class Py37Str(str):
# Allow Python 3.6 compatibility
def isascii(self: str) -> bool:
return all("\u0000" <= x <= "\u007F" for x in self)


NON_ASCII_HELP = (
"Used when the name contains at least one non-ASCII unicode character. "
"See https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers"
" for a background why this could be bad. \n"
"If your programming guideline defines that you are programming in "
"English, then there should be no need for non ASCII characters in "
"Python Names. If not you can simply disable this check."
)


class NonAsciiNameChecker(base_checker.BaseChecker):
"""A strict name checker only allowing ASCII
Note: This check only checks Names, so it ignores the content of
docstrings and comments!
"""

__implements__ = interfaces.IAstroidChecker
priority = -1

msgs = {
"C2401": (
'%s name "%s" contains a non-ASCII character, consider renaming it.',
"non-ascii-name",
NON_ASCII_HELP,
{"old_names": [("C0144", "old-non-ascii-name")]},
),
# First %s will always be "file"
"W2402": (
(
'%s name "%s" contains a non-ASCII character. PEP 3131 only allows '
"non-ascii identifiers, not file names."
),
"non-ascii-file-name",
(
# Some = PyCharm at the time of writing didn't display the non_ascii_name_loł
# files and had big troubles with git.
# Probably only a bug shows the problem quite good.
# That's also why this is a warning and not only a convention!
"Some editors don't support non-ASCII file names properly. "
"Even though Python supports UTF-8 files since Python 3.5 this isn't "
"recommended for interoperability. Further reading:\n"
"- https://www.python.org/dev/peps/pep-0489/#export-hook-name\n"
"- https://www.python.org/dev/peps/pep-0672/#confusable-characters-in-identifiers\n"
"- https://bugs.python.org/issue20485\n"
),
),
# First %s will always be "module"
"C2403": (
'%s name "%s" contains a non-ASCII character, use an ASCII-only alias for import.',
"non-ascii-module-import",
NON_ASCII_HELP,
),
}

name = "NonASCII-Checker"

def _check_name(
self, node_type: str, name: Optional[str], node: nodes.NodeNG
) -> None:
"""Check whether a name is using non-ASCII characters."""

if name is None:
# For some nodes i.e. *kwargs from a dict, the name will be empty
return

if not (Py37Str(name).isascii()):
type_label = constants.HUMAN_READABLE_TYPES[node_type]
args = (type_label.capitalize(), name)

msg = "non-ascii-name"

# Some node types have customized messages
if node_type == "file":
msg = "non-ascii-file-name"
elif node_type == "module":
msg = "non-ascii-module-import"

self.add_message(msg, node=node, args=args, confidence=interfaces.HIGH)

@utils.check_messages("non-ascii-name")
def visit_module(self, node: nodes.Module) -> None:
self._check_name("file", node.name.split(".")[-1], node)

@utils.check_messages("non-ascii-name")
def visit_functiondef(
self, node: Union[nodes.FunctionDef, nodes.AsyncFunctionDef]
) -> None:
self._check_name("function", node.name, node)

# Check argument names
arguments = node.args

# Check position only arguments
if arguments.posonlyargs:
for pos_only_arg in arguments.posonlyargs:
self._check_name("argument", pos_only_arg.name, pos_only_arg)

# Check "normal" arguments
if arguments.args:
for arg in arguments.args:
self._check_name("argument", arg.name, arg)

# Check key word only arguments
if arguments.kwonlyargs:
for kwarg in arguments.kwonlyargs:
self._check_name("argument", kwarg.name, kwarg)

visit_asyncfunctiondef = visit_functiondef

@utils.check_messages("non-ascii-name")
def visit_global(self, node: nodes.Global) -> None:
for name in node.names:
self._check_name("const", name, node)

@utils.check_messages("non-ascii-name")
def visit_assignname(self, node: nodes.AssignName) -> None:
"""check module level assigned names"""
# The NameChecker from which this Checker originates knows a lot of different
# versions of variables, i.e. constants, inline variables etc.
# To simplify we use only `variable` here, as we don't need to apply different
# rules to different types of variables.
frame = node.frame()

if isinstance(frame, nodes.FunctionDef):
if node.parent in frame.body:
# Only perform the check if the assignment was done in within the body
# of the function (and not the function parameter definition
# (will be handled in visit_functiondef)
# or within a decorator (handled in visit_call)
self._check_name("variable", node.name, node)
elif isinstance(frame, nodes.ClassDef):
self._check_name("attr", node.name, node)
else:
# Possibilities here:
# - isinstance(node.assign_type(), nodes.Comprehension) == inlinevar
# - isinstance(frame, nodes.Module) == variable (constant?)
# - some other kind of assigment missed but still most likely a variable
self._check_name("variable", node.name, node)

@utils.check_messages("non-ascii-name")
def visit_classdef(self, node: nodes.ClassDef) -> None:
self._check_name("class", node.name, node)
for attr, anodes in node.instance_attrs.items():
if not any(node.instance_attr_ancestors(attr)):
self._check_name("attr", attr, anodes[0])

def _check_module_import(self, node: Union[nodes.ImportFrom, nodes.Import]) -> None:
for module_name, alias in node.names:
name = alias or module_name
self._check_name("module", name, node)

@utils.check_messages("non-ascii-name")
def visit_import(self, node: nodes.Import) -> None:
self._check_module_import(node)

@utils.check_messages("non-ascii-name")
def visit_importfrom(self, node: nodes.ImportFrom) -> None:
self._check_module_import(node)

@utils.check_messages("non-ascii-name")
def visit_call(self, node: nodes.Call) -> None:
"""Check if the used keyword args are correct."""
for keyword in node.keywords:
self._check_name("argument", keyword.arg, keyword)


def register(linter: lint.PyLinter) -> None:
linter.register_checker(NonAsciiNameChecker(linter))
1 change: 1 addition & 0 deletions pylint/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class WarningScope:
Python {sys.version}"""

HUMAN_READABLE_TYPES = {
"file": "file",
"module": "module",
"const": "constant",
"class": "class",
Expand Down
15 changes: 13 additions & 2 deletions pylint/testutils/checker_test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,18 @@ def assertNoMessages(self):
yield

@contextlib.contextmanager
def assertAddsMessages(self, *messages: MessageTest) -> Generator[None, None, None]:
def assertAddsMessages(
self, *messages: MessageTest, ignore_position: bool = False
) -> Generator[None, None, None]:
"""Assert that exactly the given method adds the given messages.
The list of messages must exactly match *all* the messages added by the
method. Additionally, we check to see whether the args in each message can
actually be substituted into the message string.
Using the keyword argument `ignore_position`, all checks for position
arguments (line, col_offset, ...) will be skipped. This can be used to
just test messages for the correct node.
"""
yield
got = self.linter.release_messages()
Expand All @@ -53,10 +59,15 @@ def assertAddsMessages(self, *messages: MessageTest) -> Generator[None, None, No

for expected_msg, gotten_msg in zip(messages, got):
assert expected_msg.msg_id == gotten_msg.msg_id, msg
assert expected_msg.line == gotten_msg.line, msg
assert expected_msg.node == gotten_msg.node, msg
assert expected_msg.args == gotten_msg.args, msg
assert expected_msg.confidence == gotten_msg.confidence, msg

if ignore_position:
# Do not check for line, col_offset etc...
continue

assert expected_msg.line == gotten_msg.line, msg
assert expected_msg.col_offset == gotten_msg.col_offset, msg
if PY38_PLUS:
# pylint: disable=fixme
Expand Down
Loading

0 comments on commit d2475b4

Please sign in to comment.