Skip to content

Commit

Permalink
Support nested character classes and intersection with &&
Browse files Browse the repository at this point in the history
This implements parts of UTS#18 RL1.3, namely:

* Nested character classes, e.g.: `[a[b-c]]`
* Intersections in classes, e.g.: `[\w&&\p{Greek}]`

They can be combined to do things like `[\w&&[^a]]` to get all word
characters except `a`.

Fixes #341
  • Loading branch information
robinst committed Feb 22, 2017
1 parent 204e409 commit c436bfd
Show file tree
Hide file tree
Showing 2 changed files with 567 additions and 54 deletions.
159 changes: 158 additions & 1 deletion regex-syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ impl CharClass {
self.canonicalize()
}

/// Canonicalze any sequence of ranges.
/// Canonicalize any sequence of ranges.
///
/// This is responsible for enforcing the canonical format invariants
/// as described on the docs for the `CharClass` type.
Expand All @@ -703,6 +703,43 @@ impl CharClass {
ordered
}

/// Computes the class of characters matched by both `self` and `other`.
///
/// Both operands are expected to be canonical. The collected ranges are
/// passed through `canonicalize()` before being returned.
fn intersection(&self, other: &CharClass) -> CharClass {
    // An empty class has nothing in common with any other class.
    if self.ranges.is_empty() || other.ranges.is_empty() {
        return CharClass::empty();
    }

    let mut common = CharClass::empty();
    let mut left = &self.ranges[..];
    let mut right = &other.ranges[..];

    // Walk both range lists in lockstep, always comparing the current
    // head of each list, until one of the lists runs out.
    while let (Some((a, rest_a)), Some((b, rest_b))) =
        (left.split_first(), right.split_first())
    {
        if let Some(overlap) = a.intersection(b) {
            common.ranges.push(overlap);
        }

        // The range that ends first cannot overlap anything later in the
        // opposite list, so it is the one to discard. When both end at the
        // same character, the range taken from `other` is discarded.
        if a.end < b.end {
            left = rest_a;
        } else {
            right = rest_b;
        }
    }

    common.canonicalize()
}

/// Negates the character class.
///
/// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
Expand Down Expand Up @@ -801,6 +838,18 @@ impl ClassRange {
max(self.start, other.start) <= inc_char(min(self.end, other.end))
}

/// Computes the overlap between `self` and `other`.
///
/// Yields `Some` range spanning the shared characters, or `None` when
/// the two ranges have no character in common.
fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
    // The overlap begins at the later start and stops at the earlier end.
    let lo = max(self.start, other.start);
    let hi = min(self.end, other.end);
    if lo > hi {
        // The bounds crossed, so the ranges are disjoint.
        return None;
    }
    Some(ClassRange::new(lo, hi))
}

/// Creates a new range representing the union of `self` and `other`.
fn merge(self, other: ClassRange) -> ClassRange {
ClassRange {
Expand Down Expand Up @@ -1907,6 +1956,108 @@ mod tests {
]));
}

#[test]
fn class_intersection_empty() {
    // Intersecting an empty class with a non-empty one yields an empty class.
    let nothing = class(&[]);
    let only_a = class(&[('a', 'a')]);
    let expected = class(&[]);
    assert_intersection(nothing, only_a, expected);
}

#[test]
fn class_intersection_single_equal() {
    // Two identical single-character classes intersect to that character.
    let left = class(&[('a', 'a')]);
    let right = class(&[('a', 'a')]);
    let expected = class(&[('a', 'a')]);
    assert_intersection(left, right, expected);
}

#[test]
fn class_intersection_single_unequal() {
    // Two different single characters share nothing.
    let only_a = class(&[('a', 'a')]);
    let only_b = class(&[('b', 'b')]);
    let expected = class(&[]);
    assert_intersection(only_a, only_b, expected);
}

#[test]
fn class_intersection_single_in_other() {
    // A single character that lies inside a wider range survives unchanged.
    let point = class(&[('a', 'a')]);
    let span = class(&[('a', 'c')]);
    let expected = class(&[('a', 'a')]);
    assert_intersection(point, span, expected);
}

#[test]
fn class_intersection_range_in_other() {
    // A range sharing its start with a wider range is returned as-is.
    let narrow = class(&[('a', 'b')]);
    let wide = class(&[('a', 'c')]);
    let expected = class(&[('a', 'b')]);
    assert_intersection(narrow, wide, expected);
}

#[test]
fn class_intersection_range_intersection() {
    // Ranges that overlap in exactly one character yield that character.
    let lower = class(&[('a', 'b')]);
    let upper = class(&[('b', 'c')]);
    let expected = class(&[('b', 'b')]);
    assert_intersection(lower, upper, expected);
}

#[test]
fn class_intersection_only_adjacent() {
    // Ranges that merely touch (`b` is followed directly by `c`) do not
    // share any character.
    let lower = class(&[('a', 'b')]);
    let upper = class(&[('c', 'd')]);
    let expected = class(&[]);
    assert_intersection(lower, upper, expected);
}

#[test]
fn class_intersection_range_subset() {
    // A range strictly inside another is returned unchanged.
    let inner = class(&[('b', 'c')]);
    let outer = class(&[('a', 'd')]);
    let expected = class(&[('b', 'c')]);
    assert_intersection(inner, outer, expected);
}

#[test]
fn class_intersection_many_ranges_in_one_big() {
    // Several small ranges covered by one large range all come through.
    let pieces = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let cover = class(&[('a', 'h')]);
    let expected = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(pieces, cover, expected);
}

#[test]
fn class_intersection_many_ranges_same() {
    // Intersecting a multi-range class with an equal class yields the same
    // ranges.
    let left = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let right = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let expected = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(left, right, expected);
}

#[test]
fn class_intersection_multiple_non_intersecting() {
    // Interleaved ranges with gaps between them never overlap.
    let left = class(&[('a', 'b'), ('g', 'h')]);
    let right = class(&[('d', 'e'), ('k', 'l')]);
    let expected = class(&[]);
    assert_intersection(left, right, expected);
}

#[test]
fn class_intersection_non_intersecting_then_intersecting() {
    // The first two ranges miss `h`; only the last range contains it.
    let ranges = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let only_h = class(&[('h', 'h')]);
    let expected = class(&[('h', 'h')]);
    assert_intersection(ranges, only_h, expected);
}

#[test]
fn class_intersection_adjacent_alternating() {
    // The two classes alternate and touch end-to-start without sharing any
    // character.
    let left = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
    let right = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
    let expected = class(&[]);
    assert_intersection(left, right, expected);
}

#[test]
fn class_intersection_overlapping_alternating() {
    // Each range overlaps its neighbours from the other class by a single
    // character; the expected result is the one merged range `b`-`f`.
    let left = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
    let right = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
    let expected = class(&[('b', 'f')]);
    assert_intersection(left, right, expected);
}

#[test]
fn class_canon_overlap_many_case_fold() {
let cls = class(&[
Expand Down Expand Up @@ -2056,4 +2207,10 @@ mod tests {
let expr = e("(?-u)[-./]");
assert_eq!("(?-u:[-\\.-/])", expr.to_string());
}

/// Asserts that intersecting `cls1` and `cls2` produces `expected`, with
/// either class acting as the receiver.
fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
    // Intersection should be commutative, so check both operand orders:
    // first `cls1 ∩ cls2`, then `cls2 ∩ cls1`.
    for &(lhs, rhs) in [(&cls1, &cls2), (&cls2, &cls1)].iter() {
        assert_eq!(lhs.intersection(rhs), expected);
    }
}
}
Loading

0 comments on commit c436bfd

Please sign in to comment.