forked from IBM/Project_CodeNet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter5.awk
executable file
·214 lines (195 loc) · 5.94 KB
/
filter5.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/awk -f
# Copyright (c) 2020 International Business Machines Corporation
# Prepared by: Geert Janssen <[email protected]>
# Expects a C/C++ tokenizer generated CSV file as input.
# (tokenize -m csv)
# Outputs one possibly modified token (class or literal) per line.
# Recognizes identifiers and checks them against a list of all standard
# C library functions (from /usr/include header files), CPP directives,
# and some other standard names like stdin, stdout, etc.
# Any other identifier is output as the representative id.
# Keywords are output as themselves, e.g., while is output as while.
# The integer literals 0 and 1 are output as themselves; any other integer
# literal is output as 123 and any floating-point literal as 3.14.
# String and character literals are output as "" and 'A', respectively.
# Some operators and punctuation are distinguished; all others are output
# as *.
# Preprocessor symbols are output as # resp. ##.
# Literal representatives are used instead of the token class names. That
# way the token sequence is still close to a syntactically valid piece of
# code.
# Also, there is no longer confusion between a class name and some identifier.
# Representative: Meaning:
# id any identifier
# [ ] ( ) { } < > any of these punctuators
# ; ? : :: , . ... any of these punctuators
# = any assignment operator (+=, *=, etc.)
# ++ -- auto-increment and auto-decrement
# - minus sign (kept distinct to better handle negative numbers)
# * any other operator (* covers more cases than +)
# 0 the number 0
# 1 the number 1
# 123 any integer number except 0 and 1
# 3.14 any floating-point number
# "" any string literal
# 'A' any character literal
# # the preprocessor symbol #
# ## the preprocessor symbol ## (very rare)
# dirname(pathname)
#   Returns pathname with its final "/component" (and an optional trailing
#   slash) removed.  When pathname contains no such component, e.g. a bare
#   file name or the empty string, "." is returned instead.
#   Note: a path whose only slash is the leading one ("/name") yields the
#   empty string, not "/".
function dirname(pathname) {
  # sub() edits the local copy in place and reports whether it matched.
  return sub(/\/[^\/]*\/?$/, "", pathname) ? pathname : "."
}
# Initialization: builds the table of "standard" identifiers that are passed
# through verbatim, defines the operator/punctuation lookup strings, sets
# the CSV field separator and skips the CSV header line.
BEGIN {
  # Locate the directory where this script resides so that the companion
  # file std-C-lib-funcs.txt can be read from the same place (ugly hack:
  # relies on gawk's PROCINFO and on a /proc file system).
  cmdfile = "/proc/" PROCINFO["pid"] "/cmdline"
  script = ""
  if ((getline cmdline < cmdfile) > 0) {
    # The command-line arguments are split on NUL characters.
    nargs = split(cmdline, fields, "\0")
    # Use the argument that follows the first -f/--file option, so that
    # options given before it (e.g. -v ORS=" ") do not confuse the lookup.
    for (i = 2; i < nargs; i++) {
      if (fields[i] == "-f" || fields[i] == "--file") {
        script = fields[i + 1]
        break
      }
    }
    # Fall back to the third argument (the "awk -f script" layout).
    if (script == "")
      script = fields[3]
  }
  close(cmdfile)
  path = dirname(script)

  # Read the file of all C standard function names, one per line
  # (280 names according to the vocabulary tally of this script).
  input = path "/std-C-lib-funcs.txt"
  while ((rc = (getline line < input)) > 0) {
    standard[line] = 1
  }
  close(input)
  # Report a missing file on stderr so the warning never ends up in the
  # token stream on stdout; printf keeps it independent of ORS.
  if (rc < 0)
    printf "(W) cannot open %s\n", input > "/dev/stderr"

  # Add some other standard names (20):
  standard["stdin"] = 1
  standard["stdout"] = 1
  standard["stderr"] = 1
  standard["main"] = 1
  standard["argv"] = 1
  standard["argc"] = 1
  standard["NULL"] = 1
  standard["FILE"] = 1
  standard["size_t"] = 1
  standard["ssize_t"] = 1
  standard["int8_t"] = 1
  standard["int16_t"] = 1
  standard["int32_t"] = 1
  standard["int64_t"] = 1
  standard["uint8_t"] = 1
  standard["uint16_t"] = 1
  standard["uint32_t"] = 1
  standard["uint64_t"] = 1
  standard["uintptr_t"] = 1
  standard["errno"] = 1

  # Let's throw in some C++ names as well (11):
  standard["std"] = 1
  standard["cin"] = 1
  standard["cout"] = 1
  standard["cerr"] = 1
  standard["endl"] = 1
  standard["vector"] = 1
  standard["map"] = 1
  standard["string"] = 1
  standard["set"] = 1
  standard["queue"] = 1
  standard["ptrdiff_t"] = 1

  # Treat some CPP identifiers as keywords (11):
  standard["include"] = 1
  standard["ifdef"] = 1
  standard["ifndef"] = 1
  standard["elif"] = 1
  standard["endif"] = 1
  standard["define"] = 1
  standard["undef"] = 1
  standard["defined"] = 1
  standard["__FILE__"] = 1
  standard["__LINE__"] = 1
  standard["pragma"] = 1

  # Punctuation that is output verbatim (17 symbols, including the minus
  # sign).  The main rule tests membership with index(), i.e. by substring.
  punctuation = "[](){}<>;?::,...=-"

  # unary and binary overlap: + -
  #unary_ops = ~ ! + - * & ++ --
  #binary_ops = + - * / % ^ & | < > , -> == != <= >= && || << >> <=>
  #? = .* ->*

  # Compound assignment operators; all of them map to = and therefore add
  # nothing to the vocabulary.  They are grouped by length because the main
  # rule checks the token length before searching these strings.
  assign2_ops = "+= -= *= /= %= &= |= ^="
  assign3_ops = "<<= >>="

  # Output record separator (space after each print, set with -v):
  #ORS=" "
  # Saved value of ORS (the main rule temporarily clears ORS):
  saved_ORS = ORS

  # (Input) field separator:
  FS = ","

  # Skip header:
  getline
}
# Main rule, run for every CSV record after the header.
# Column 3 holds the token class and column 4 the token text; exactly one
# representative is printed per recognized class, and records of any other
# class produce no output.
#
# Vocabulary size (author's tally):
#     5 class names: identifier, number, operator, character, string
#    95 C/C++20 keywords
#   280 standard functions
#    42 cpp and other standard names
#    17 punctuation symbols (and minus sign)
#     2 preprocessor symbols # and ##
#     2 integer constants 0,1
#   === +
#   443
{
  # The newline branch at the bottom clears ORS; undo that first.
  ORS = saved_ORS

  if ($3 == "keyword" || $3 == "preprocessor") {
    # Keywords and preprocessor symbols are output as themselves.
    print $4
  }
  else if ($3 == "identifier") {
    # Only the known standard names survive; everything else collapses
    # into the short representative id.
    if (!($4 in standard))
      print "id"
    else
      print $4
  }
  else if ($3 == "operator") {
    if (index(punctuation, $4) > 0 || $4 == "++" || $4 == "--") {
      # Distinguished punctuation and auto-increment/decrement.
      print $4
    }
    else if ($4 == "\"" && $5 == "\"") {
      # The escaped comma "," was split in two by FS; put it back.
      print ","
    }
    else if ((length($4) == 2 && index(assign2_ops, $4) > 0) ||
             (length($4) == 3 && index(assign3_ops, $4) > 0)) {
      # Every compound assignment operator becomes a plain =.
      print "="
    }
    else {
      # No unary/binary distinction: use one real operator that can
      # appear in prefix, infix, and postfix position.
      print "*"
    }
  }
  else if ($3 == "integer") {
    # Only the values 0 and 1 are kept (a minus sign is never part of the
    # number literal); any other integer gets the representative 123.
    if (index("0 1", $4) > 0)
      print $4
    else
      print "123"
  }
  else if ($3 == "floating") {
    # Floating-point representative.
    print "3.14"
  }
  else if ($3 == "string") {
    # A real (empty) string literal.
    print "\"\""
  }
  else if ($3 == "character") {
    # A real character literal.
    print "'A'"
  }
  else if ($3 == "newline") {
    # Hack: blank ORS so that no extra separator follows the newline.
    ORS = ""
    print "\n"
  }
}
# Nothing to do at end of input.
END {}