Skip to content

Commit

Permalink
PS-7968: Implement BiDi scan for PS via Azure pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
oleksandr-kachan committed Nov 15, 2021
1 parent 7806689 commit 3047d0e
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 0 deletions.
12 changes: 12 additions & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
jobs:
- job: BiDiScan
pool:
vmImage: 'ubuntu-18.04'

steps:
- checkout: self
fetchDepth: 32

- script: |
git fetch origin 5.7
python $(Build.SourcesDirectory)/scripts/find_unicode_control.py -p bidi -v $(git diff --name-only --relative --diff-filter AMR origin/5.7 -- . | tr '\n' ' ')
- job:
timeoutInMinutes: 240
pool:
Expand Down
224 changes: 224 additions & 0 deletions scripts/find_unicode_control.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""Find unicode control characters in source files
By default the script takes one or more files or directories and looks for
unicode control characters in all text files. To narrow down the files, provide
a config file with the -c command-line option, defining a scan_exclude list, which
should be a list of regular expressions matching paths to exclude from the scan.
There is a second mode enabled with -p which when set to 'all', prints all
control characters and when set to 'bidi', prints only the 9 bidirectional
control characters.
"""
from __future__ import print_function

import sys, os, argparse, re, unicodedata, subprocess
import importlib
from stat import *

def _unicode(line, encoding):
if isinstance(line, str):
return line
return line.decode(encoding)

import platform

# Pick the right wide-character helpers for the running interpreter:
# Python 2 has unichr/unicode builtins, Python 3 folds them into chr and
# our _unicode() shim.
if platform.python_version().startswith('2'):
    _chr = unichr          # noqa: F821 -- Python 2 builtin
    do_unicode = unicode   # noqa: F821 -- Python 2 builtin
else:
    _chr = chr
    do_unicode = _unicode

# Default path patterns excluded from the scan; each entry is a regex
# matched with re.search against the path.  A -c config file may extend
# this list via its own scan_exclude attribute.
scan_exclude = [r'\.git/', r'\.hg/', r'\.desktop$', r'ChangeLog$', r'NEWS$',
                r'\.ppd$', r'\.txt$', r'\.directory$']
# Mime types (as reported by file(1)) skipped even when the path itself is
# not excluded; extendable via a config file's scan_exclude_mime.
scan_exclude_mime = [r'text/x-po$', r'text/x-tex$', r'text/x-troff$',
                     r'text/html$']
# Set from the -v command-line flag in __main__; gates eprint() output.
verbose_mode = False

# Print to stderr in verbose mode.
def eprint(*args, **kwargs):
    """Emit a diagnostic line on stderr, but only when -v was given."""
    if not verbose_mode:
        return
    print(*args, file=sys.stderr, **kwargs)

# Decode a single line.  NOTE(review): despite the old "latin1" comment
# this has always decoded as utf-8; the encoding is now a parameter with
# an unchanged default so callers can request another codec explicitly.
def decodeline(inf, encoding='utf-8'):
    return do_unicode(inf, encoding)

# Make a text string from a file, attempting to decode from latin1 if
# necessary.  Other non-utf-8 locales are not supported at the moment.
#
# Returns a list of decoded lines in detailed mode, a set of the distinct
# characters otherwise, or None when the file cannot be read/decoded.
def getfiletext(filename):
    text = None
    with open(filename) as infile:
        try:
            if detailed_mode:
                return [decodeline(inf) for inf in infile]
        except Exception as e:
            eprint('%s: %s' % (filename, e))
            return None

        try:
            text = decodeline(''.join(infile))
        except UnicodeDecodeError:
            eprint('%s: Retrying with latin1' % filename)

    # BUGFIX: the original retry re-read the same (already exhausted) file
    # object and re-attempted utf-8, so the latin1 fallback always produced
    # an empty string.  Reopen the file in binary mode and decode latin1,
    # which maps every byte and therefore cannot fail.
    if text is None:
        try:
            with open(filename, 'rb') as infile:
                text = infile.read().decode('latin1')
        except Exception as e:
            eprint('%s: %s' % (filename, e))

    if text:
        return set(text)
    else:
        return None

def analyze_text_detailed(filename, text, disallowed, msg):
    """Scan *text* (an iterable of lines) line by line, printing every line
    number that contains a character from *disallowed*.  Returns True when
    anything was reported."""
    found = False
    for lineno, row in enumerate(text, start=1):
        hits = [c for c in row if chr(ord(c)) in disallowed]
        if hits:
            print('%s:%d %s: %s' % (filename, lineno, msg, hits))
            found = True

    if not found:
        eprint('%s: OK' % filename)

    return found

# Look for disallowed characters in the text. We reduce all characters into
# a set to speed up analysis. FIXME: Add a slow mode to get line numbers in
# files that have these disallowed chars.
def analyze_text(filename, text, disallowed, msg):
    """Report any character of *disallowed* present in *text* (a set of
    characters, or lines when detailed mode is active)."""
    if detailed_mode:
        return analyze_text_detailed(filename, text, disallowed, msg)

    hits = text & disallowed
    if hits:
        print('%s: %s: %s' % (filename, msg, hits))
        return True

    eprint('%s: OK' % filename)
    return False

def should_read(f):
    """Decide whether file *f* should be scanned.

    The cheap path-name exclusion runs first so that file(1) is only
    spawned for candidate files; the reported mime type is then checked
    against the mime exclusion list."""
    # Fast check, just the file name.
    if [e for e in scan_exclude if re.search(e, f)]:
        return False

    # Slower check, mime type (via file(1)).
    args = ['file', '--mime-type', f]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    out = proc.stdout.read()
    proc.stdout.close()
    proc.wait()  # reap the child so it does not linger as a zombie
    # rsplit on the last ':' -- the file name itself may contain colons.
    m = decodeline(out.splitlines()[0]).rsplit(':', 1)[1].strip()

    if not 'text/' in m \
       or [e for e in scan_exclude_mime if re.search(e, m)]:
        return False
    return True

# Get file text and feed into analyze_text.
def analyze_file(f, disallowed, msg):
    """Scan one file; returns True when disallowed characters were found."""
    eprint('%s: Reading file' % f)
    if not should_read(f):
        eprint('%s: SKIPPED' % f)
        return False

    text = getfiletext(f)
    return bool(text and analyze_text(f, text, disallowed, msg))

# Actual implementation of the recursive descent into directories.
def analyze_any(p, disallowed, msg):
    """Dispatch path *p* to the directory or file scanner by stat mode."""
    mode = os.stat(p).st_mode
    if S_ISDIR(mode):
        return analyze_dir(p, disallowed, msg)
    if S_ISREG(mode):
        return analyze_file(p, disallowed, msg)

    eprint('%s: UNREADABLE' % p)
    return False

# Recursively analyze files in the directory.
def analyze_dir(d, disallowed, msg):
    """Scan every entry of directory *d*; True when anything was flagged."""
    # Materialize the list first: every entry must be scanned even after
    # the first hit, so no short-circuiting here.
    results = [analyze_any(os.path.join(d, entry), disallowed, msg)
               for entry in os.listdir(d)]
    return any(results)

def analyze_paths(paths, disallowed, msg):
    """Scan each path (file or directory); True when anything was flagged."""
    # Scan all paths unconditionally (no short-circuit) so every finding
    # is printed, then report whether any path produced a warning.
    flagged = [analyze_any(p, disallowed, msg) for p in paths]
    return any(flagged)

# All control characters. We omit the ascii control characters.
def nonprint_unicode(c):
    """True for any Unicode 'C*' category character except Cc (which
    covers the ASCII control range)."""
    category = unicodedata.category(c)
    return category.startswith('C') and category != 'Cc'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Look for Unicode control characters")
    parser.add_argument('path', metavar='path', nargs='+',
                        help='Sources to analyze')
    parser.add_argument('-p', '--nonprint', required=False,
                        type=str, choices=['all', 'bidi'],
                        help='Look for either all non-printable unicode characters or bidirectional control characters.')
    parser.add_argument('-v', '--verbose', required=False, action='store_true',
                        help='Verbose mode.')
    parser.add_argument('-d', '--detailed', required=False, action='store_true',
                        help='Print line numbers where characters occur.')
    parser.add_argument('-t', '--notests', required=False,
                        action='store_true', help='Exclude tests (basically test.* as a component of path).')
    parser.add_argument('-c', '--config', required=False, type=str,
                        help='Configuration file to read settings from.')

    args = parser.parse_args()
    verbose_mode = args.verbose
    detailed_mode = args.detailed

    # Build the set of characters to flag, depending on the -p mode.
    if not args.nonprint:
        # Formatting control characters in the unicode space. This includes
        # the bidi control characters.
        disallowed = set(_chr(c) for c in range(sys.maxunicode) if
                         unicodedata.category(_chr(c)) == 'Cf')
        msg = 'unicode control characters'
    elif args.nonprint == 'all':
        # All control characters.
        disallowed = set(_chr(c) for c in range(sys.maxunicode) if
                         nonprint_unicode(_chr(c)))
        msg = 'disallowed characters'
    else:
        # Only the 9 bidirectional control characters.
        disallowed = set([
            _chr(0x202a), _chr(0x202b), _chr(0x202c), _chr(0x202d), _chr(0x202e),
            _chr(0x2066), _chr(0x2067), _chr(0x2068), _chr(0x2069)])
        msg = 'bidirectional control characters'

    if args.config:
        # BUGFIX: a plain 'import importlib' does not guarantee the 'util'
        # submodule is bound; import it explicitly before use.
        import importlib.util
        spec = importlib.util.spec_from_file_location("settings", args.config)
        settings = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(settings)
        # Config files may extend (never replace) the built-in exclusions.
        if hasattr(settings, 'scan_exclude'):
            scan_exclude = scan_exclude + settings.scan_exclude
        if hasattr(settings, 'scan_exclude_mime'):
            scan_exclude_mime = scan_exclude_mime + settings.scan_exclude_mime

    if args.notests:
        scan_exclude = scan_exclude + [r'/test[^/]+/']

    # Exit non-zero when anything was flagged so CI pipelines fail.
    if analyze_paths(args.path, disallowed, msg):
        sys.exit(1)

    sys.exit(0)

0 comments on commit 3047d0e

Please sign in to comment.