-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmarkmath.py
executable file
·144 lines (122 loc) · 4.6 KB
/
markmath.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/python
import re
import sys
# TOKENTYPES:
# Integer tags stored in `tokentype`, one per entry of `tokens`.
(
    UNKNOWN,       # -1: default tag for every non-whitespace token
    SPLIT,         #  0: whitespace separator produced by the split regex
    PRESERVEMATH,  #  1: token belongs to math/code that is already marked
    DETECTEDMATH,  #  2: token recognised as math; gets wrapped in $...$
    DELETE,        #  3: token was merged into an earlier one and is dropped
    INTERMEDIATE,  #  4: ambiguous token, resolved from its neighbours
) = range(-1, 5)
# REGEXPS:
# Every pattern is written as a raw string so that regex escapes such as
# \s, \( or \d reach the regex engine unchanged instead of relying on
# Python passing unknown string escapes through (a SyntaxWarning on
# recent Python versions).

# Token separator: a run of whitespace. The capturing group makes
# re.split keep the separators as tokens of their own.
splitat = re.compile(r"(\s+|\n)")

# Unescaped "$" or "$$" delimiters; group 1 is the delimiter itself, so
# its length (1 or 2) tells inline math from display math.
dollar = re.compile(r"(?:(?:[^\\\$]|^)((?:\$\$)|\$))")

# Unescaped backtick delimiters; group 1 is "`" (inline code) or "```"
# (fenced code). The preceding character must not be a backtick so that
# a fence at the start of a token is captured as one 3-character
# delimiter rather than being split up.
backtick = re.compile(r"(?:(?:[^\\`]|^)(```|`))")

# Opening / closing of LaTeX environments whose content must be left
# untouched. Both patterns accept the same environment names so that
# every recognised \begin{...} has a matching recognised \end{...}.
beginpreservemath = re.compile(
    r"(?:\\begin\{((align)|(equation)|(verbatim))\*?\})"
    r"|(?:\\begin\{(display)?math\})"
    r"|(?:\\begin\{displaymath\})"
)
endpreservemath = re.compile(
    r"(?:\\end\{((align)|(equation)|(verbatim))\*?\})"
    r"|(?:\\end\{(display)?math\})"
    r"|(?:\\end\{displaymath\})"
)

# Optional trailing ")" and "." or "," allowed at the end of a token.
end = r"(\)?(\.|,)?$)"

# A token counts as math if it contains a call-like "(...)", starts with
# a backslash command, contains "_" or "^", contains "|", or is a single
# alphanumeric followed by primes. `ismath` stays a plain string;
# `isatex` is its compiled form.
ismath = r"((.+\(.*\).*)|(\(?(\\[^\$].*))|(.+(_|\^).+)|(.*\|.*)|([A-Za-z0-9]'+))" + end
isatex = re.compile(ismath)

# Tokens that are math only in context: a number, a single letter, or one
# of the symbols + - = : ,
isintermediate = re.compile(r"(\d+|[A-Z]|[a-z]|\+|-|=|:|,)" + end)
# DICTIONARY REPLACINGS:
# Ordered (regex pattern, replacement template) pairs that turn ASCII
# arrow shorthands into LaTeX commands. They are applied one after the
# other with re.sub, so order matters: labelled and longer arrows come
# before the shorter arrows they contain (e.g. "-->" before "->").
# In the replacement templates "\\" is a literal backslash and "\1" is
# the label captured between the parentheses.
repls = [
    # labelled arrows, e.g. "-(f)->" or "=(def)=>"
    (r"(?:-\((.+)\)->)", r"\\xrightarrow{\1}"),
    (r"(?:<=\((.+)\)=>)", r"\\stackrel{\1}{\\Leftrightarrow}"),
    # "<=(label)=" points left, so it becomes a labelled \Leftarrow,
    # consistent with the plain "<=" entry below.
    (r"(?:<=\((.+)\)=)", r"\\stackrel{\1}{\\Leftarrow}"),
    (r"(?:=\((.+)\)=>)", r"\\stackrel{\1}{\\Rightarrow}"),
    (r"(?:=\((.+)\)=)", r"\\stackrel{\1}{=}"),
    # plain arrows
    ("~~>", r"\\rightsquigarrow"),
    ("~>", r"\\rightsquigarrow"),
    ("-->", r"\\longrightarrow"),
    ("<->", r"\\leftrightarrow"),
    ("->", r"\\rightarrow"),
    ("<--", r"\\longleftarrow"),
    ("<-", r"\\leftarrow"),
    ("<=>", r"\\Leftrightarrow"),
    ("<=", r"\\Leftarrow"),
    ("=>", r"\\Rightarrow"),
    # surround quantifiers with thin spaces
    (r"\\forall", r"\\, \\forall \\,"),
    (r"\\exists", r"\\, \\exists \\,"),
]
# read file
# The path of the text to process is the first command-line argument.
# Exit with a usage message instead of an IndexError traceback when it
# is missing.
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " FILE")
with open(sys.argv[1], 'r') as content_file:
    content = content_file.read()
# split
# Cut the text at whitespace runs. Because the split pattern has a
# capturing group, the whitespace runs stay in the list as tokens.
tokens = splitat.split(content)
# Tag each whitespace token as SPLIT and everything else as UNKNOWN.
tokentype = [
    UNKNOWN if splitat.match(piece) is None else SPLIT
    for piece in tokens
]
# fix braces: merge some splits
# While a "{" is still unclosed, every following token (whitespace
# included) is appended to the token at which the merge started and is
# tagged DELETE, so a braced group ends up as one single token.
depth = 0    # number of "{" minus number of "}" seen so far
anchor = 0   # index of the most recent token seen while no brace was open
for idx, piece in enumerate(tokens):
    if depth > 0:
        tokentype[idx] = DELETE
        tokens[anchor] = tokens[anchor] + piece
    else:
        anchor = idx
    # Only earlier entries of `tokens` are modified above, so `piece` is
    # still the unmerged text of the current token.
    depth += piece.count("{") - piece.count("}")
# Drop the merged-away tokens from both parallel lists.
tokens = [piece for piece, kind in zip(tokens, tokentype) if kind != DELETE]
tokentype = [kind for kind in tokentype if kind != DELETE]
# preserve already marked math
# Walk the tokens once, tracking which delimiters are currently open, and
# tag every token that lies inside (or contains) such a delimiter as
# PRESERVEMATH.
env_depth = 0      # \begin{...} matches minus \end{...} matches so far
in_dollar = 0      # 1 while a single "$" is open
in_ddollar = 0     # 1 while a "$$" is open
in_tick = 0        # 1 while a 1-character backtick delimiter is open
in_fence = 0       # 1 while a 3-character backtick delimiter is open
for idx, piece in enumerate(tokens):
    # State before this token: covers the token that closes an
    # environment, after which env_depth is already back to zero.
    open_before = env_depth > 0 or in_dollar or in_ddollar

    dollar_sizes = [len(hit) for hit in dollar.findall(piece)]
    in_dollar = (in_dollar + dollar_sizes.count(1)) % 2
    in_ddollar = (in_ddollar + dollar_sizes.count(2)) % 2

    tick_sizes = [len(hit) for hit in backtick.findall(piece)]
    in_tick = (in_tick + tick_sizes.count(1)) % 2
    in_fence = (in_fence + tick_sizes.count(3)) % 2

    env_depth += len(beginpreservemath.findall(piece))
    env_depth -= len(endpreservemath.findall(piece))

    open_after = (
        env_depth > 0
        or in_dollar
        or in_ddollar
        or in_tick
        or in_fence
    )
    has_delimiter = len(dollar_sizes) > 0 or len(tick_sizes) > 0
    if open_before or open_after or has_delimiter:
        tokentype[idx] = PRESERVEMATH
# substitutions in non-preserved tokens
# Run every (pattern, replacement) pair from `repls`, in order, over each
# token that is neither preserved math nor whitespace.
for idx, kind in enumerate(tokentype):
    if kind in (PRESERVEMATH, SPLIT):
        continue
    text = tokens[idx]
    for pattern, replacement in repls:
        text = re.sub(pattern, replacement, text)
    tokens[idx] = text
# detect math
# Tag each remaining token: DETECTEDMATH if it matches the math pattern,
# otherwise INTERMEDIATE if it matches the intermediate pattern. Tokens
# matching neither keep their current tag.
for idx, kind in enumerate(tokentype):
    if kind in (PRESERVEMATH, SPLIT):
        continue
    word = tokens[idx]
    # `isatex` is the compiled form of the `ismath` pattern string.
    if isatex.match(word):
        tokentype[idx] = DETECTEDMATH
    elif isintermediate.match(word):
        tokentype[idx] = INTERMEDIATE
# classify intermediates according to context
# An INTERMEDIATE token becomes DETECTEDMATH when the nearest non-SPLIT
# token on its left or on its right is DETECTEDMATH. Tags are updated in
# place, so a promotion is visible to the tokens that follow it.
ntokens = len(tokens)
for i in range(ntokens):
    if tokentype[i] != INTERMEDIATE:
        continue
    # nearest non-whitespace neighbour on each side (may not exist)
    jl = i - 1
    while jl >= 0 and tokentype[jl] == SPLIT:
        jl = jl - 1
    jr = i + 1
    while jr < ntokens and tokentype[jr] == SPLIT:
        jr = jr + 1
    # Guard both indices: jl == -1 would silently wrap around to the last
    # token, and jr == ntokens would raise IndexError.
    left_is_math = jl >= 0 and tokentype[jl] == DETECTEDMATH
    right_is_math = jr < ntokens and tokentype[jr] == DETECTEDMATH
    if left_is_math or right_is_math:
        tokentype[i] = DETECTEDMATH
# output and mark detected math
# Print all tokens back to stdout unchanged, except that every maximal
# run of DETECTEDMATH tokens (separated only by whitespace) is wrapped in
# a single pair of "$". The closing "$" is printed directly after the
# last math token, before any trailing whitespace.
mathon = 0   # 1 while an opening "$" has been printed but not yet closed
ntokens = len(tokens)
for i in range(ntokens):
    if tokentype[i] != DETECTEDMATH:
        print(tokens[i], end='')
        continue
    if mathon:
        print(tokens[i], end='')
    else:
        print('$' + tokens[i], end='')
        mathon = 1
    # look ahead to the next non-whitespace token
    jr = i + 1
    while jr < ntokens and tokentype[jr] == SPLIT:
        jr = jr + 1
    # Close the math run when the next token is not math, or when there
    # is no next token at all (math at the very end of the input).
    if jr >= ntokens or tokentype[jr] != DETECTEDMATH:
        print("$", end='')
        mathon = 0