-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathrure.go
544 lines (496 loc) · 16 KB
/
rure.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
package rure
// #cgo LDFLAGS: -lrure
//
// #include <stdio.h>
// #include <stdlib.h>
// #include "rure.h"
//
// /*
// * rure_iter_collect exhausts the given iterator over the given haystack,
// * and reports all successive non-overlapping match locations in *matches.
// * *num_matches is set to the number of matches found.
// *
// * *matches contains `*num_matches * 2` offsets, where `2 * i` and
// * `2 * i + 1` represent the start and end byte offsets of match `i`.
// */
// void rure_iter_collect(rure_iter *it,
// const uint8_t *haystack, size_t length,
// size_t **matches, size_t *num_matches)
// {
// rure_match m = {0};
// size_t len = 0;
// size_t cap = 64;
// *matches = malloc(cap * sizeof(size_t));
// if (NULL == *matches) {
// fprintf(stderr, "rure_iter_collect: out of memory, aborting\n");
// abort();
// }
// while (rure_iter_next(it, haystack, length, &m)) {
// if ((len * 2 + 1) >= cap) {
// cap *= 2;
// *matches = realloc(*matches, cap * sizeof(size_t));
// if (NULL == *matches) {
// fprintf(
// stderr, "rure_iter_collect: out of memory, aborting\n");
// abort();
// }
// }
// (*matches)[len * 2 + 0] = m.start;
// (*matches)[len * 2 + 1] = m.end;
// len++;
// }
// *num_matches = len;
// }
import "C"
import (
"fmt"
"reflect"
"runtime"
"unsafe"
)
// Flags for modifying regex behavior. All flags can be modified in the
// expression itself using standard syntax. e.g., `(?i)` enables case
// insensitivity and `(?-i)` disables it.
const (
// FlagCaseI is the case insensitive (i) flag.
FlagCaseI = C.RURE_FLAG_CASEI
// FlagMulti is the multi-line matching (m) flag.
// (^ and $ match new line boundaries.)
FlagMulti = C.RURE_FLAG_MULTI
// FlagDotNL is the any character (s) flag. (. matches new line.)
FlagDotNL = C.RURE_FLAG_DOTNL
// FlagSwapGreed is the greedy swap (U) flag.
// (e.g., + is ungreedy and +? is greedy.)
FlagSwapGreed = C.RURE_FLAG_SWAP_GREED
// FlagSpace is the ignore whitespace (x) flag.
FlagSpace = C.RURE_FLAG_SPACE
// FlagUnicode is the Unicode (u) flag.
FlagUnicode = C.RURE_FLAG_UNICODE
// FlagDefault is used when calling MustCompile or Compile.
FlagDefault = FlagUnicode
)
// Regex is a compiled regular expression.
//
// It can be used safely from multiple goroutines simultaneously.
type Regex struct {
pattern string
p *C.rure
}
// Options represents non-flag compile time options.
//
// For example, calling SetSizeLimit will place an upper bound on how big
// the compiled regular expression can be.
type Options struct {
p *C.rure_options
}
// Captures represents start and end locations for every matching capture group
// in a regular expression match.
//
// It is not safe to use from multiple goroutines simultaneously.
type Captures struct {
re *Regex
p *C.rure_captures
ok bool
}
// Iter is an iterator over successive non-overlapping matches in a haystack.
//
// It is not safe to use from multiple goroutines simultaneously.
type Iter struct {
re *Regex
p *C.rure_iter
haystack []byte
match C.rure_match
}
// Error is an error that caused compilation of a regular expression to fail.
//
// Most errors are syntax errors, but an error can be returned if the compiled
// regular expression would be too big.
type Error struct {
p *C.rure_error
}
// MustCompile is like Compile, but if there was a problem compiling the
// pattern, then it will panic.
func MustCompile(pattern string) *Regex {
re, err := Compile(pattern)
if err != nil {
panic(fmt.Sprintf("regex.MustCompile failed on %s: %s", pattern, err))
}
return re
}
// Compile is like CompileOptions, but uses default flags (Unicode enabled)
// and default size limits.
//
// If there was a problem compiling the pattern, then no Regex is returned and
// a non-nil error is returned.
func Compile(pattern string) (*Regex, error) {
return CompileOptions(pattern, FlagDefault, nil)
}
// CompileOptions compiles a pattern (in UTF-8) to a regular expression
// suitable for searching text.
//
// Flags is a bitfield of the Flag constants in this package. A value of `0`
// disables all flags.
//
// Options is a set non-flag configuration settings for the compiled regular
// expression. When set to nil, default settings are used.
//
// If there was a problem compiling the pattern (including if it is not valid
// UTF-8), then an error is returned.
//
// N.B. When disabling Unicode support (either via an explicit flag here or in
// the pattern), one can use escape sequences to match bytes that aren't valid
// UTF-8. For example, use `\\xFF` instead of `\xFF`.
func CompileOptions(
pattern string,
flags uint32,
options *Options,
) (*Regex, error) {
re := &Regex{pattern: pattern}
runtime.SetFinalizer(re, func(re *Regex) {
if re.p != nil {
C.rure_free(re.p)
re.p = nil
}
})
var optp *C.rure_options
if options != nil {
optp = options.p
}
err := newError()
re.p = C.rure_compile(
asUint8Ptr(noCopyBytes(pattern)),
C.size_t(len(pattern)),
FlagDefault,
optp,
err.p,
)
if re.p == nil {
return nil, err
}
return re, nil
}
func (re *Regex) String() string {
return re.pattern
}
// IsMatch returns true if text matches re.
func (re *Regex) IsMatch(text string) bool {
return re.IsMatchBytesAt(noCopyBytes(text), 0)
}
// IsMatchBytes returns true if text matches re.
func (re *Regex) IsMatchBytes(text []byte) bool {
return re.IsMatchBytesAt(text, 0)
}
// IsMatchAt returns true if text matches re starting at index i.
func (re *Regex) IsMatchAt(text string, i int) bool {
return re.IsMatchBytesAt(noCopyBytes(text), i)
}
// IsMatchBytesAt returns true if text matches re starting at index i.
func (re *Regex) IsMatchBytesAt(text []byte, i int) bool {
return bool(C.rure_is_match(
re.p, asUint8Ptr(text), C.size_t(len(text)), C.size_t(i)))
}
// ShortestMatch returns the end location of a match in text if it exists. This
// may return an end location that occurs before the end of the proper
// leftmost-first match.
//
// If no match exists, false is returned.
//
// For example, matching `a+` against `aaaaa` will return `1` (while `Find`
// will report `5` as the end).
func (re *Regex) ShortestMatch(text string) (end int, ok bool) {
return re.ShortestMatchBytes(noCopyBytes(text))
}
// ShortestMatchBytes returns the end location of a match in text if it exists.
// This may return an end location that occurs before the end of the proper
// leftmost-first match.
//
// If no match exists, false is returned.
//
// For example, matching `a+` against `aaaaa` will return `1` (while `Find`
// will report `5` as the end).
func (re *Regex) ShortestMatchBytes(text []byte) (end int, ok bool) {
var cend C.size_t
ok = bool(C.rure_shortest_match(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &cend))
end = int(cend)
return
}
// Find returns the start and end location of the leftmost-first match in text
// if it exists.
//
// If no match exists, false is returned.
func (re *Regex) Find(text string) (start, end int, ok bool) {
return re.FindBytes(noCopyBytes(text))
}
// FindBytes returns the start and end location of the leftmost-first match in
// text if it exists.
//
// If no match exists, false is returned.
func (re *Regex) FindBytes(text []byte) (start, end int, ok bool) {
match := C.rure_match{}
ok = bool(C.rure_find(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &match))
if ok {
start, end = int(match.start), int(match.end)
}
return
}
// FindAll returns all successive non-overlapping matches of re in text.
//
// The slice returned contains a pair of start and end offsets for each match
// found. The start and end offset for match i is indexed by i*2 and i*2+1,
// respectively.
//
// This may be faster than using Iter since the slice of matches is built in
// C code.
func (re *Regex) FindAll(text string) []int {
return re.FindAllBytes(noCopyBytes(text))
}
// FindAllBytes returns all successive non-overlapping matches of re in text.
//
// The slice returned contains a pair of start and end offsets for each match
// found. The start and end offset for match i is indexed by i*2 and i*2+1,
// respectively.
//
// This may be faster than using Iter since the slice of matches is built in
// C code.
func (re *Regex) FindAllBytes(text []byte) []int {
it := C.rure_iter_new(re.p)
defer C.rure_iter_free(it)
haystack := asUint8Ptr(text)
len := C.size_t(len(text))
nmatches := C.size_t(0)
matches := (*C.size_t)(nil)
defer func() {
if matches != nil {
C.free(unsafe.Pointer(matches))
}
}()
C.rure_iter_collect(it, haystack, len, &matches, &nmatches)
if nmatches == 0 {
return nil
}
// Copy the matches from C memory to Go memory.
matchesInts := make([]int, 2*nmatches)
p := uintptr(unsafe.Pointer(matches))
stride := unsafe.Sizeof(C.size_t(0))
for i := uintptr(0); i < uintptr(nmatches); i++ {
base := i * 2
start := unsafe.Pointer(p + (base * stride))
end := unsafe.Pointer(p + ((base + 1) * stride))
matchesInts[base] = int(*(*C.size_t)(start))
matchesInts[base+1] = int(*(*C.size_t)(end))
}
return matchesInts
}
// NewCaptures allocates room for storing the start and end offset of each
// capturing group in re.
//
// Captures may be reused in subsequent calls. When it is reused, its internal
// state is reset.
//
// Captures may not be used from multiple threads simultaneously.
func (re *Regex) NewCaptures() *Captures {
caps := &Captures{re: re, p: C.rure_captures_new(re.p)}
runtime.SetFinalizer(caps, func(caps *Captures) {
if caps.p != nil {
C.rure_captures_free(caps.p)
caps.p = nil
}
})
return caps
}
// Captures populates caps with the start and end locations of all matching
// capturing groups in re for text.
//
// If no match is found, then false is returned.
//
// A caps value can be built with re.NewCaptures().
//
// caps must not be nil.
func (re *Regex) Captures(caps *Captures, text string) bool {
return re.CapturesBytes(caps, noCopyBytes(text))
}
// CapturesBytes populates caps with the start and end locations of all
// matching capturing groups in re for text.
//
// If no match is found, then false is returned.
//
// A caps value can be built with re.NewCaptures().
//
// caps must not be nil.
func (re *Regex) CapturesBytes(caps *Captures, text []byte) bool {
caps.ok = bool(C.rure_find_captures(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, caps.p))
return caps.ok
}
// Iter returns an iterator over successive non-overlapping matches of re
// in text.
//
// Next must be called on the iterator before accessing match information.
func (re *Regex) Iter(text string) *Iter {
return re.IterBytes(noCopyBytes(text))
}
// IterBytes returns an iterator over successive non-overlapping matches of re
// in text.
//
// Next must be called on the iterator before accessing match information.
func (re *Regex) IterBytes(text []byte) *Iter {
return newIter(re, text)
}
// CaptureNames returns a slice of the names of call capturing groups in this
// regex. The slice has the same order as the order of the appearance of each
// capturing group. Index 0 corresponds to the entire regex match, and is
// therefore always unnamed. Unnamed capturing groups are always represented by
// an empty string.
func (re *Regex) CaptureNames() []string {
it := C.rure_iter_capture_names_new(re.p)
defer C.rure_iter_capture_names_free(it)
var names []string
var name *C.char
for C.rure_iter_capture_names_next(it, &name) {
names = append(names, C.GoString(name))
}
return names
}
// NewOptions returns a fresh options value for configuring non-flag options
// of a regex.
//
// Options can be passed to multiple calls to CompileOptions.
//
// Options is not safe to mutate from multiple goroutines simultaneously, but
// it may be used in calls to CompileOptions from multiple goroutines
// simultaneously.
func NewOptions() *Options {
opts := &Options{C.rure_options_new()}
runtime.SetFinalizer(opts, func(opts *Options) {
if opts.p != nil {
C.rure_options_free(opts.p)
opts.p = nil
}
})
return opts
}
// SetSizeLimit sets the approximate size limit (in bytes) of the compiled
// regular expression.
//
// If a pattern would result in a compiled program large than this size, then
// compilation will return an error.
func (opts *Options) SetSizeLimit(limit int) {
C.rure_options_size_limit(opts.p, C.size_t(limit))
}
// SetDFASizeLimit sets the approximate size limit (in bytes) of the DFA's
// cache size. It is a per-thread limit, so that a Regex used in multiple
// threads simultaneously may use up to this many bytes per-thread of usage.
//
// 0 is a legal value.
func (opts *Options) SetDFASizeLimit(limit int) {
C.rure_options_dfa_size_limit(opts.p, C.size_t(limit))
}
// IsMatch returns true if caps corresponds to a match in a regular expression.
func (caps *Captures) IsMatch() bool {
return caps.ok
}
// Group returns the start and end offsets for the capturing group indexed by
// i. Capturing groups are indexed by the appearance of their opening
// parenthesis in the pattern.
//
// If the capturing group was not part of the match, then this returns false.
//
// Note that capture group 0 always corresponds to the full match of the
// regular expression and is always unnamed.
func (caps *Captures) Group(i int) (start, end int, ok bool) {
match := C.rure_match{}
ok = bool(C.rure_captures_at(caps.p, C.size_t(i), &match))
if ok {
start, end = int(match.start), int(match.end)
}
return
}
// GroupName is like Group, but uses the name of a capturing group instead of
// its index. Named capture groups look like (?P<foo>re) in the pattern.
//
// If no such named capture group exists or if it wasn't part of the match
// of the regular expression, GroupName returns false.
func (caps *Captures) GroupName(name string) (start, end int, ok bool) {
i := C.rure_capture_name_index(caps.re.p, C.CString(name))
if i == -1 {
return
}
return caps.Group(int(i))
}
// Len returns the number of capturing groups.
//
// Once caps is created, this never changes.
func (caps *Captures) Len() int {
return int(C.rure_captures_len(caps.p))
}
func newIter(re *Regex, haystack []byte) *Iter {
it := &Iter{
re: re,
haystack: haystack,
p: C.rure_iter_new(re.p),
}
runtime.SetFinalizer(it, func(it *Iter) {
if it.p != nil {
C.rure_iter_free(it.p)
}
})
return it
}
// Next advances the iterator. If it finds a match, it returns true, and
// otherwise returns false. Once it returns false, it will always return false.
//
// This must be called before a call to Match.
//
// If caps is nil, then capture information is not extracted. If caps is not
// nil, then the start and end offsets of each matching capturing group are
// stored in caps.
func (it *Iter) Next(caps *Captures) bool {
haystack := asUint8Ptr(it.haystack)
len := C.size_t(len(it.haystack))
if caps == nil {
return bool(C.rure_iter_next(it.p, haystack, len, &it.match))
}
caps.ok = bool(C.rure_iter_next_captures(it.p, haystack, len, caps.p))
C.rure_captures_at(caps.p, 0, &it.match)
return caps.ok
}
// Match returns the start and end offsets of the current match in the
// iteraotr.
func (it *Iter) Match() (start, end int) {
return int(it.match.start), int(it.match.end)
}
func newError() *Error {
err := &Error{C.rure_error_new()}
runtime.SetFinalizer(err, func(err *Error) {
if err.p != nil {
C.rure_error_free(err.p)
err.p = nil
}
})
return err
}
func (err *Error) Error() string {
return C.GoString(C.rure_error_message(err.p))
}
// Converts a string to a []byte without allocating.
//
// This is very dangerous and must be handled with care. In particular, the
// input given must be kept alive for the duration of the return value.
func noCopyBytes(s string) []byte {
sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
bh := reflect.SliceHeader{
Data: sh.Data,
Len: sh.Len,
Cap: sh.Len,
}
return *(*[]byte)(unsafe.Pointer(&bh))
}
// Converts a byte slice to a *C.uint8_t.
//
// This works even for empty slices.
func asUint8Ptr(bs []byte) *C.uint8_t {
bh := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
return (*C.uint8_t)(unsafe.Pointer(bh.Data))
}