-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpreprocess.py
executable file
·109 lines (102 loc) · 3.24 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
import argparse
import re
import string
import sys
import unicodedata
def main(args):
for line in args.infile:
if not line.rstrip():
if not args.filter_boundaries:
print()
continue
tokens = line.rstrip().split("\t")
if len(tokens) < 2 and args.filter_empty:
continue
if args.filter_punctuation:
punct = [
all(map(lambda x: unicodedata.category(x).startswith("P"), t))
for t in tokens
if len(t) > 0
]
if any(punct):
continue
if args.unicode_normalize is not None:
tokens = [unicodedata.normalize(args.unicode_normalize, t) for t in tokens]
if args.lower:
tokens = [t.lower() for t in tokens]
if args.digits and any(digit in tokens[0] for digit in string.digits):
if all(t == tokens[0] for t in tokens[1:]):
tokens = [re.sub("\d", "0", t) for t in tokens]
if args.spaces != " ":
tokens = [re.sub(" ", args.spaces, t) for t in tokens]
print("\t".join(tokens))
if __name__ == "__main__":
description = "Preprocess normalization datasets in two-column format."
epilog = ""
parser = argparse.ArgumentParser(description=description, epilog=epilog)
parser.add_argument(
"infile", type=argparse.FileType("r", encoding="UTF-8"), help="Input file"
)
parser.add_argument(
"-A",
"--all",
action="store_true",
default=False,
help='Equivalent to "-bdepl -s ÷ -u NFC"; overrides all other options',
)
parser.add_argument(
"-b",
"--filter-boundaries",
action="store_true",
default=False,
help="Remove empty lines (indicating sentence boundaries)",
)
parser.add_argument(
"-d",
"--digits",
action="store_true",
default=False,
help="Replace digits with zeroes when they are identical across columns",
)
parser.add_argument(
"-e",
"--filter-empty",
action="store_true",
default=False,
help="Filter lines where either column is empty",
)
parser.add_argument(
"-l", "--lower", action="store_true", default=False, help="Lowercase everything"
)
parser.add_argument(
"-p",
"--filter-punctuation",
action="store_true",
default=False,
help="Filter lines where either column consists only of punctuation characters",
)
parser.add_argument(
"-s",
"--spaces",
type=str,
default=" ",
help='Replace spaces within columns with this character (default: "%(default)s")',
)
parser.add_argument(
"-u",
"--unicode-normalize",
choices=("NFC", "NFKC", "NFD", "NFKD"),
default=None,
help="Perform Unicode normalization to given standard form (default: None)",
)
args = parser.parse_args()
if args.all:
args.filter_boundaries = True
args.digits = True
args.filter_empty = True
args.filter_punctuation = True
args.lower = True
args.spaces = "÷"
args.unicode_normalize = "NFC"
main(args)