forked from percona/percona-server
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind_unicode_control.py
224 lines (188 loc) · 7.35 KB
/
find_unicode_control.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python3
"""Find unicode control characters in source files
By default the script takes one or more files or directories and looks for
unicode control characters in all text files. To narrow down the files, provide
a config file with the -c command line, defining a scan_exclude list, which
should be a list of regular expressions matching paths to exclude from the scan.
There is a second mode enabled with -p which when set to 'all', prints all
control characters and when set to 'bidi', prints only the 9 bidirectional
control characters.
"""
from __future__ import print_function

import argparse
import importlib
import importlib.util
import os
import re
import subprocess
import sys
import unicodedata
from stat import *
def _unicode(line, encoding):
if isinstance(line, str):
return line
return line.decode(encoding)
# Select the wide-character constructor and the text-decoding helper once,
# at import time, based on the interpreter major version.
import platform
if platform.python_version()[0] == '2':
    # Python 2: unichr/unicode builtins produce unicode objects.
    _chr = unichr
    do_unicode = unicode
else:
    # Python 3: str is already unicode; _unicode only decodes bytes.
    _chr = chr
    do_unicode = _unicode
# Paths matching any of these regexes are never scanned (a -c config file
# may append to this list).
scan_exclude = [r'\.git/', r'\.hg/', r'\.desktop$', r'ChangeLog$', r'NEWS$',
                r'\.ppd$', r'\.txt$', r'\.directory$']
# Mime types (as reported by file(1)) excluded even though they are text/*.
scan_exclude_mime = [r'text/x-po$', r'text/x-tex$', r'text/x-troff$',
                     r'text/html$']
# Set from the -v option in __main__; gates all eprint() output.
verbose_mode = False
# Print to stderr in verbose mode.
def eprint(*args, **kwargs):
    """Diagnostic print: forwarded to stderr, emitted only when verbose_mode is set."""
    if not verbose_mode:
        return
    print(*args, file=sys.stderr, **kwargs)
# Decode a single line as utf-8.  NOTE(review): the previous comment said
# "latin1", but the encoding hard-coded below is utf-8 — confirm intent.
def decodeline(inf):
    # str passes through unchanged; bytes are decoded (see do_unicode).
    return do_unicode(inf, 'utf-8')
# Make a text string from a file, attempting to decode from latin1 if necessary.
# Other non-utf-8 locales are not supported at the moment.
def getfiletext(filename):
    """Read *filename* and return its contents for scanning.

    Returns a list of decoded lines (with line endings) in detailed mode,
    otherwise the set of distinct characters in the file.  Returns None when
    the file cannot be read or decoded, or is empty in non-detailed mode.

    Fix: the original retry path iterated an already-exhausted file handle
    and still decoded as utf-8, so the advertised latin1 fallback could
    never succeed.  Read the raw bytes once and decode them twice instead.
    """
    try:
        with open(filename, 'rb') as infile:
            raw = infile.read()
    except Exception as e:
        eprint('%s: %s' % (filename, e))
        return None
    try:
        text = do_unicode(raw, 'utf-8')
    except UnicodeDecodeError:
        # latin1 maps every byte to a code point, so this retry cannot fail
        # with a decode error.
        eprint('%s: Retrying with latin1' % filename)
        try:
            text = do_unicode(raw, 'latin1')
        except Exception as e:
            eprint('%s: %s' % (filename, e))
            return None
    if detailed_mode:
        # Keep line endings so per-line output mirrors the file contents.
        return text.splitlines(True)
    if text:
        return set(text)
    return None
def analyze_text_detailed(filename, text, disallowed, msg):
    """Scan *text* line by line, reporting disallowed characters with line numbers.

    Returns True when at least one disallowed character was found.
    """
    found = False
    for lineno, content in enumerate(text, 1):
        hits = [c for c in content if chr(ord(c)) in disallowed]
        if hits:
            found = True
            print('%s:%d %s: %s' % (filename, lineno, msg, hits))
    if not found:
        eprint('%s: OK' % filename)
    return found
# Look for disallowed characters in the text.  Non-detailed callers pass the
# file reduced to a set of characters, which makes the check a single set
# intersection; detailed mode delegates to the per-line scanner.
def analyze_text(filename, text, disallowed, msg):
    """Report disallowed characters found in *text*; True when any were found."""
    if detailed_mode:
        return analyze_text_detailed(filename, text, disallowed, msg)
    hits = text & disallowed
    if hits:
        print('%s: %s: %s' % (filename, msg, hits))
        return True
    eprint('%s: OK' % filename)
    return False
def should_read(f):
    """Return True when *f* looks like a text file worth scanning.

    A file is skipped when its path matches a scan_exclude pattern, or when
    its mime type (per file(1)) is not text/* or matches scan_exclude_mime.

    Fixes: the original spawned file(1) even for name-excluded paths, never
    waited on the child (zombie leak), and split on every ':' so file names
    containing a colon were mis-parsed.
    """
    # Fast check, just the file name — do it before paying for a subprocess.
    if any(re.search(e, f) for e in scan_exclude):
        return False
    # Slower check, mime type.
    proc = subprocess.Popen(['file', '--mime-type', f], stdout=subprocess.PIPE)
    try:
        first = proc.stdout.readline()
    finally:
        proc.stdout.close()
        proc.wait()  # reap the child
    # file(1) prints '<name>: <mime>'; rsplit keeps names with ':' intact.
    m = decodeline(first[:-1]).rsplit(':', 1)[-1].strip()
    if 'text/' not in m:
        return False
    return not any(re.search(e, m) for e in scan_exclude_mime)
# Get file text and feed into analyze_text.
def analyze_file(f, disallowed, msg):
    """Scan one regular file; True when disallowed characters were found."""
    eprint('%s: Reading file' % f)
    if not should_read(f):
        eprint('%s: SKIPPED' % f)
        return False
    contents = getfiletext(f)
    return bool(contents and analyze_text(f, contents, disallowed, msg))
# Actual implementation of the recursive descent into directories.
def analyze_any(p, disallowed, msg):
    """Dispatch on the stat type of *p*: recurse into directories, scan
    regular files, and report anything else as unreadable."""
    mode = os.stat(p).st_mode
    if S_ISDIR(mode):
        return analyze_dir(p, disallowed, msg)
    if S_ISREG(mode):
        return analyze_file(p, disallowed, msg)
    eprint('%s: UNREADABLE' % p)
    return False
# Recursively analyze files in the directory.
def analyze_dir(d, disallowed, msg):
    """Scan every entry of directory *d*; True when any entry warned."""
    # Build the full list first so every entry is visited (no short-circuit).
    results = [analyze_any(os.path.join(d, entry), disallowed, msg)
               for entry in os.listdir(d)]
    return any(results)
def analyze_paths(paths, disallowed, msg):
    """Scan each path in *paths*; True when any of them warned."""
    # Visit every path unconditionally so all findings are printed.
    flags = [analyze_any(p, disallowed, msg) for p in paths]
    return any(flags)
# All control characters. We omit the ascii control characters.
def nonprint_unicode(c):
    """True for characters in any unicode 'C*' category except Cc."""
    category = unicodedata.category(c)
    return category.startswith('C') and category != 'Cc'
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Look for Unicode control characters")
    parser.add_argument('path', metavar='path', nargs='+',
                        help='Sources to analyze')
    parser.add_argument('-p', '--nonprint', required=False,
                        type=str, choices=['all', 'bidi'],
                        help='Look for either all non-printable unicode characters or bidirectional control characters.')
    parser.add_argument('-v', '--verbose', required=False, action='store_true',
                        help='Verbose mode.')
    parser.add_argument('-d', '--detailed', required=False, action='store_true',
                        help='Print line numbers where characters occur.')
    parser.add_argument('-t', '--notests', required=False,
                        action='store_true', help='Exclude tests (basically test.* as a component of path).')
    parser.add_argument('-c', '--config', required=False, type=str,
                        help='Configuration file to read settings from.')
    args = parser.parse_args()
    # Module-level globals consumed by eprint(), getfiletext() and
    # analyze_text(); must be set before any scanning starts.
    verbose_mode = args.verbose
    detailed_mode = args.detailed
    if not args.nonprint:
        # Formatting control characters in the unicode space. This includes the
        # bidi control characters.
        disallowed = set(_chr(c) for c in range(sys.maxunicode) if \
                         unicodedata.category(_chr(c)) == 'Cf')
        msg = 'unicode control characters'
    elif args.nonprint == 'all':
        # All control characters.
        disallowed = set(_chr(c) for c in range(sys.maxunicode) if \
                         nonprint_unicode(_chr(c)))
        msg = 'disallowed characters'
    else:
        # Only bidi control characters (LRE/RLE/PDF/LRO/RLO, LRI/RLI/FSI/PDI).
        disallowed = set([
            _chr(0x202a), _chr(0x202b), _chr(0x202c), _chr(0x202d), _chr(0x202e),
            _chr(0x2066), _chr(0x2067), _chr(0x2068), _chr(0x2069)])
        msg = 'bidirectional control characters'
    if args.config:
        # Load the config file as a python module via importlib.util and
        # merge its optional scan_exclude / scan_exclude_mime lists into ours.
        spec = importlib.util.spec_from_file_location("settings", args.config)
        settings = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(settings)
        if hasattr(settings, 'scan_exclude'):
            scan_exclude = scan_exclude + settings.scan_exclude
        if hasattr(settings, 'scan_exclude_mime'):
            scan_exclude_mime = scan_exclude_mime + settings.scan_exclude_mime
    if args.notests:
        # Skip any path component that starts with 'test' (e.g. /tests/).
        scan_exclude = scan_exclude + [r'/test[^/]+/']
    # Exit status 1 when any disallowed character was reported.
    if (analyze_paths(args.path, disallowed, msg)):
        sys.exit(1)
    sys.exit(0)