From e3d6e7cae0fb68c570f9606419852fd87a3d1871 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Tue, 13 Mar 2018 19:23:45 -0400
Subject: [PATCH 1/3] style: reword ast::print docs

Also, small formatting fix and removal of debugging test.
---
 regex-syntax/src/ast/print.rs | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
index 0d6dfb0a20..ddec734cef 100644
--- a/regex-syntax/src/ast/print.rs
+++ b/regex-syntax/src/ast/print.rs
@@ -9,7 +9,7 @@
 // except according to those terms.
 
 /*!
-This module provides a regular expression printer.
+This module provides a regular expression printer for `Ast`.
 */
 
 use std::fmt;
@@ -422,7 +422,7 @@ mod tests {
     }
 
     fn roundtrip_with<F>(mut f: F, given: &str)
-        where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
+    where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
     {
         let mut builder = ParserBuilder::new();
         f(&mut builder);
@@ -434,11 +434,6 @@ mod tests {
         assert_eq!(given, dst);
     }
 
-    #[test]
-    fn scratch() {
-        roundtrip(".");
-    }
-
     #[test]
     fn print_literal() {
         roundtrip("a");

From 369e0f9e2c23a2009c7c132540f58633a56562d0 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Tue, 13 Mar 2018 19:26:30 -0400
Subject: [PATCH 2/3] syntax/hir: fix handling of ASCII word boundaries

Previously, we had some inconsistencies in how we were handling ASCII
word boundaries. In particular, the translator was accepting a negated
ASCII word boundary even if the caller didn't disable the UTF-8 invariant.
This is wrong, since a negated ASCII word boundary can match between any
two arbitrary bytes. However, fixing this is a breaking change, so for
now we document the bug. We plan to fix it with regex 1.0. See #457.

Additionally, we were incorrectly declaring that an ASCII word boundary
matched invalid UTF-8 via the Hir::is_always_utf8 property. An ASCII word
boundary must always match an ASCII byte on one side, which implies a
valid UTF-8 position.
---
 regex-syntax/src/hir/mod.rs       |  5 +----
 regex-syntax/src/hir/translate.rs | 37 +++++++++++++++++++++++++------
 regex-syntax/src/parser.rs        |  5 +++++
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index d443c538b3..d249064484 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -302,10 +302,7 @@ impl Hir {
         // A negated word boundary matches the empty string, but a normal
         // word boundary does not!
         info.set_match_empty(word_boundary.is_negated());
-        // ASCII word boundaries can match invalid UTF-8.
-        if let WordBoundary::Ascii = word_boundary {
-            info.set_always_utf8(false);
-        }
+        // Negated ASCII word boundaries can match invalid UTF-8.
         if let WordBoundary::AsciiNegate = word_boundary {
             info.set_always_utf8(false);
         }
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index c56d9745a2..bdde0ddfa1 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -58,6 +58,11 @@ impl TranslatorBuilder {
     /// When disabled (the default), the translator is guaranteed to produce
     /// an expression that will only ever match valid UTF-8 (otherwise, the
     /// translator will return an error).
+    ///
+    /// Note that currently, even when invalid UTF-8 is banned, the translator
+    /// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
+    /// though it can actually match at invalid UTF-8 boundaries. This bug
+    /// will be fixed on the next semver release.
     pub fn allow_invalid_utf8(
         &mut self,
         yes: bool,
@@ -290,7 +295,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                 self.push(HirFrame::Expr(try!(self.hir_dot(span))));
             }
             Ast::Assertion(ref x) => {
-                self.push(HirFrame::Expr(self.hir_assertion(x)));
+                self.push(HirFrame::Expr(try!(self.hir_assertion(x))));
             }
             Ast::Class(ast::Class::Perl(ref x)) => {
                 if self.flags().unicode() {
@@ -679,10 +684,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         })
     }
 
-    fn hir_assertion(&self, asst: &ast::Assertion) -> Hir {
+    fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
         let unicode = self.flags().unicode();
         let multi_line = self.flags().multi_line();
-        match asst.kind {
+        Ok(match asst.kind {
             ast::AssertionKind::StartLine => {
                 Hir::anchor(if multi_line {
                     hir::Anchor::StartLine
@@ -714,10 +719,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                 Hir::word_boundary(if unicode {
                     hir::WordBoundary::UnicodeNegate
                 } else {
+                    // It is possible for negated ASCII word boundaries to
+                    // match at invalid UTF-8 boundaries, even when searching
+                    // valid UTF-8.
+                    //
+                    // TODO(ag): Enable this error when regex goes to 1.0.
+                    // Otherwise, it is too steep of a breaking change.
+                    // if !self.trans().allow_invalid_utf8 {
+                        // return Err(self.error(
+                            // asst.span, ErrorKind::InvalidUtf8));
+                    // }
                     hir::WordBoundary::AsciiNegate
                 })
             }
-        }
+        })
     }
 
     fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
@@ -1490,7 +1505,15 @@ mod tests {
         assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
         assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
         assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
-        assert_eq!(t(r"(?-u)\B"), hir_word(hir::WordBoundary::AsciiNegate));
+        assert_eq!(
+            t_bytes(r"(?-u)\B"),
+            hir_word(hir::WordBoundary::AsciiNegate));
+
+        // TODO(ag): Enable this tests when regex goes to 1.0.
+        // assert_eq!(t_err(r"(?-u)\B"), TestError {
+            // kind: hir::ErrorKind::InvalidUtf8,
+            // span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
+        // });
     }
 
     #[test]
@@ -2355,13 +2378,13 @@ mod tests {
         assert!(t_bytes(r"[^a][^a]").is_always_utf8());
         assert!(t_bytes(r"\b").is_always_utf8());
         assert!(t_bytes(r"\B").is_always_utf8());
+        assert!(t_bytes(r"(?-u)\b").is_always_utf8());
 
         // Negative examples.
         assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8());
         assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8());
         assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8());
         assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8());
-        assert!(!t_bytes(r"(?-u)\b").is_always_utf8());
         assert!(!t_bytes(r"(?-u)\B").is_always_utf8());
     }
 
@@ -2490,7 +2513,7 @@ mod tests {
         assert!(t(r"\A").is_match_empty());
         assert!(t(r"\z").is_match_empty());
         assert!(t(r"\B").is_match_empty());
-        assert!(t(r"(?-u)\B").is_match_empty());
+        assert!(t_bytes(r"(?-u)\B").is_match_empty());
 
         // Negative examples.
         assert!(!t(r"a+").is_match_empty());
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index 2afd2fe234..e28d7f3263 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -87,6 +87,11 @@ impl ParserBuilder {
     /// When disabled (the default), the parser is guaranteed to produce
     /// an expression that will only ever match valid UTF-8 (otherwise, the
     /// parser will return an error).
+    ///
+    /// Note that currently, even when invalid UTF-8 is banned, the parser
+    /// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
+    /// though it can actually match at invalid UTF-8 boundaries. This bug
+    /// will be fixed on the next semver release.
     pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
         self.hir.allow_invalid_utf8(yes);
         self

From 3ff742b2191caad60723895a14f3de0e9a02f49b Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Tue, 13 Mar 2018 19:27:26 -0400
Subject: [PATCH 3/3] syntax/hir: add a printer for HIR

This adds a printer for the high-level intermediate representation. The
regex it prints is valid, and can be used as a way to turn it into a
regex::Regex.
---
 regex-syntax/src/hir/mod.rs   |  18 ++
 regex-syntax/src/hir/print.rs | 359 ++++++++++++++++++++++++++++++++++
 2 files changed, 377 insertions(+)
 create mode 100644 regex-syntax/src/hir/print.rs

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index d249064484..903e6085be 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -25,6 +25,7 @@ pub use hir::visitor::{Visitor, visit};
 
 mod interval;
 pub mod literal;
+pub mod print;
 pub mod translate;
 mod visitor;
 
@@ -152,6 +153,10 @@ impl fmt::Display for ErrorKind {
 ///    and can be computed cheaply during the construction process. For
 ///    example, one such attribute is whether the expression must match at the
 ///    beginning of the text.
+///
+/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
+/// expression pattern string, and uses constant stack space and heap space
+/// proportional to the size of the `Hir`.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct Hir {
     /// The underlying HIR kind.
@@ -602,6 +607,19 @@ impl HirKind {
     }
 }
 
+/// Print a display representation of this Hir.
+///
+/// The result of this is a valid regular expression pattern string.
+///
+/// This implementation uses constant stack space and heap space proportional
+/// to the size of the `Hir`.
+impl fmt::Display for Hir {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use hir::print::Printer;
+        Printer::new().print(self, f)
+    }
+}
+
 /// The high-level intermediate representation of a literal.
 ///
 /// A literal corresponds to a single character, where a character is either
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
new file mode 100644
index 0000000000..f31468504a
--- /dev/null
+++ b/regex-syntax/src/hir/print.rs
@@ -0,0 +1,359 @@
+/*!
+This module provides a regular expression printer for `Hir`.
+*/
+
+use std::fmt;
+
+use hir::{self, Hir, HirKind};
+use hir::visitor::{self, Visitor};
+use is_meta_character;
+
+/// A builder for constructing a printer.
+///
+/// Note that since a printer doesn't have any configuration knobs, this type
+/// remains unexported.
+#[derive(Clone, Debug)]
+struct PrinterBuilder {
+    _priv: (),
+}
+
+impl Default for PrinterBuilder {
+    fn default() -> PrinterBuilder {
+        PrinterBuilder::new()
+    }
+}
+
+impl PrinterBuilder {
+    fn new() -> PrinterBuilder {
+        PrinterBuilder {
+            _priv: (),
+        }
+    }
+
+    fn build(&self) -> Printer {
+        Printer {
+            _priv: (),
+        }
+    }
+}
+
+/// A printer for a regular expression's high-level intermediate
+/// representation.
+///
+/// A printer converts a high-level intermediate representation (HIR) to a
+/// regular expression pattern string. This particular printer uses constant
+/// stack space and heap space proportional to the size of the HIR.
+///
+/// Since this printer is only using the HIR, the pattern it prints will likely
+/// not resemble the original pattern at all. For example, a pattern like
+/// `\pL` will have its entire class written out.
+///
+/// The purpose of this printer is to provide a means to mutate an HIR and then
+/// build a regular expression from the result of that mutation. (A regex
+/// library could provide a constructor from this HIR explicitly, but that
+/// creates an unnecessary public coupling between the regex library and this
+/// specific HIR representation.)
+#[derive(Debug)]
+pub struct Printer {
+    _priv: (),
+}
+
+impl Printer {
+    /// Create a new printer.
+    pub fn new() -> Printer {
+        PrinterBuilder::new().build()
+    }
+
+    /// Print the given `Ast` to the given writer. The writer must implement
+    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
+    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
+    /// implementations) or a `&mut String`.
+    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
+        visitor::visit(hir, Writer { printer: self, wtr: wtr })
+    }
+}
+
+#[derive(Debug)]
+struct Writer<'p, W> {
+    printer: &'p mut Printer,
+    wtr: W,
+}
+
+impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
+    type Output = ();
+    type Err = fmt::Error;
+
+    fn finish(self) -> fmt::Result {
+        Ok(())
+    }
+
+    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
+        match *hir.kind() {
+            HirKind::Empty
+            | HirKind::Repetition(_)
+            | HirKind::Concat(_)
+            | HirKind::Alternation(_) => {}
+            HirKind::Literal(hir::Literal::Unicode(c)) => {
+                try!(self.write_literal_char(c));
+            }
+            HirKind::Literal(hir::Literal::Byte(b)) => {
+                try!(self.write_literal_byte(b));
+            }
+            HirKind::Class(hir::Class::Unicode(ref cls)) => {
+                try!(self.wtr.write_str("["));
+                for range in cls.iter() {
+                    if range.start() == range.end() {
+                        try!(self.write_literal_char(range.start()));
+                    } else {
+                        try!(self.write_literal_char(range.start()));
+                        try!(self.wtr.write_str("-"));
+                        try!(self.write_literal_char(range.end()));
+                    }
+                }
+                try!(self.wtr.write_str("]"));
+            }
+            HirKind::Class(hir::Class::Bytes(ref cls)) => {
+                try!(self.wtr.write_str("(?-u:["));
+                for range in cls.iter() {
+                    if range.start() == range.end() {
+                        try!(self.write_literal_class_byte(range.start()));
+                    } else {
+                        try!(self.write_literal_class_byte(range.start()));
+                        try!(self.wtr.write_str("-"));
+                        try!(self.write_literal_class_byte(range.end()));
+                    }
+                }
+                try!(self.wtr.write_str("])"));
+            }
+            HirKind::Anchor(hir::Anchor::StartLine) => {
+                try!(self.wtr.write_str("(?m:^)"));
+            }
+            HirKind::Anchor(hir::Anchor::EndLine) => {
+                try!(self.wtr.write_str("(?m:$)"));
+            }
+            HirKind::Anchor(hir::Anchor::StartText) => {
+                try!(self.wtr.write_str(r"\A"));
+            }
+            HirKind::Anchor(hir::Anchor::EndText) => {
+                try!(self.wtr.write_str(r"\z"));
+            }
+            HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
+                try!(self.wtr.write_str(r"\b"));
+            }
+            HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+                try!(self.wtr.write_str(r"\B"));
+            }
+            HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
+                try!(self.wtr.write_str(r"(?-u:\b)"));
+            }
+            HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
+                try!(self.wtr.write_str(r"(?-u:\B)"));
+            }
+            HirKind::Group(ref x) => {
+                match x.kind {
+                    hir::GroupKind::CaptureIndex(_) => {
+                        try!(self.wtr.write_str("("));
+                    }
+                    hir::GroupKind::CaptureName { ref name, .. } => {
+                        try!(write!(self.wtr, "(?P<{}>", name));
+                    }
+                    hir::GroupKind::NonCapturing => {
+                        try!(self.wtr.write_str("(?:"));
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
+        match *hir.kind() {
+            // Handled during visit_pre
+            HirKind::Empty
+            | HirKind::Literal(_)
+            | HirKind::Class(_)
+            | HirKind::Anchor(_)
+            | HirKind::WordBoundary(_)
+            | HirKind::Concat(_)
+            | HirKind::Alternation(_) => {}
+            HirKind::Repetition(ref x) => {
+                match x.kind {
+                    hir::RepetitionKind::ZeroOrOne => {
+                        try!(self.wtr.write_str("?"));
+                    }
+                    hir::RepetitionKind::ZeroOrMore => {
+                        try!(self.wtr.write_str("*"));
+                    }
+                    hir::RepetitionKind::OneOrMore => {
+                        try!(self.wtr.write_str("+"));
+                    }
+                    hir::RepetitionKind::Range(ref x) => {
+                        match *x {
+                            hir::RepetitionRange::Exactly(m) => {
+                                try!(write!(self.wtr, "{{{}}}", m));
+                            }
+                            hir::RepetitionRange::AtLeast(m) => {
+                                try!(write!(self.wtr, "{{{},}}", m));
+                            }
+                            hir::RepetitionRange::Bounded(m, n) => {
+                                try!(write!(self.wtr, "{{{},{}}}", m, n));
+                            }
+                        }
+                    }
+                }
+                if !x.greedy {
+                    try!(self.wtr.write_str("?"));
+                }
+            }
+            HirKind::Group(_) => {
+                try!(self.wtr.write_str(")"));
+            }
+        }
+        Ok(())
+    }
+
+    fn visit_alternation_in(&mut self) -> fmt::Result {
+        self.wtr.write_str("|")
+    }
+}
+
+impl<'p, W: fmt::Write> Writer<'p, W> {
+    fn write_literal_char(&mut self, c: char) -> fmt::Result {
+        if is_meta_character(c) {
+            try!(self.wtr.write_str("\\"));
+        }
+        self.wtr.write_char(c)
+    }
+
+    fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
+        let c = b as char;
+        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
+            self.wtr.write_char(c)
+        } else {
+            write!(self.wtr, "(?-u:\\x{:02X})", b)
+        }
+    }
+
+    fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
+        let c = b as char;
+        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
+            self.wtr.write_char(c)
+        } else {
+            write!(self.wtr, "\\x{:02X}", b)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use ParserBuilder;
+    use super::Printer;
+
+    fn roundtrip(given: &str, expected: &str) {
+        roundtrip_with(|b| b, given, expected);
+    }
+
+    fn roundtrip_bytes(given: &str, expected: &str) {
+        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
+    }
+
+    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
+    where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
+    {
+        let mut builder = ParserBuilder::new();
+        f(&mut builder);
+        let hir = builder.build().parse(given).unwrap();
+
+        let mut printer = Printer::new();
+        let mut dst = String::new();
+        printer.print(&hir, &mut dst).unwrap();
+        assert_eq!(expected, dst);
+    }
+
+    #[test]
+    fn print_literal() {
+        roundtrip("a", "a");
+        roundtrip(r"\xff", "\u{FF}");
+        roundtrip_bytes(r"\xff", "\u{FF}");
+        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
+        roundtrip("☃", "☃");
+    }
+
+    #[test]
+    fn print_class() {
+        roundtrip(r"[a]", r"[a]");
+        roundtrip(r"[a-z]", r"[a-z]");
+        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
+        roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
+        roundtrip(r"[-]", r"[\-]");
+        roundtrip(r"[☃-⛄]", r"[☃-⛄]");
+
+        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
+        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
+        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
+    }
+
+    #[test]
+    fn print_anchor() {
+        roundtrip(r"^", r"\A");
+        roundtrip(r"$", r"\z");
+        roundtrip(r"(?m)^", r"(?m:^)");
+        roundtrip(r"(?m)$", r"(?m:$)");
+    }
+
+    #[test]
+    fn print_word_boundary() {
+        roundtrip(r"\b", r"\b");
+        roundtrip(r"\B", r"\B");
+        roundtrip(r"(?-u)\b", r"(?-u:\b)");
+        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
+    }
+
+    #[test]
+    fn print_repetition() {
+        roundtrip("a?", "a?");
+        roundtrip("a??", "a??");
+        roundtrip("(?U)a?", "a??");
+
+        roundtrip("a*", "a*");
+        roundtrip("a*?", "a*?");
+        roundtrip("(?U)a*", "a*?");
+
+        roundtrip("a+", "a+");
+        roundtrip("a+?", "a+?");
+        roundtrip("(?U)a+", "a+?");
+
+        roundtrip("a{1}", "a{1}");
+        roundtrip("a{1,}", "a{1,}");
+        roundtrip("a{1,5}", "a{1,5}");
+        roundtrip("a{1}?", "a{1}?");
+        roundtrip("a{1,}?", "a{1,}?");
+        roundtrip("a{1,5}?", "a{1,5}?");
+        roundtrip("(?U)a{1}", "a{1}?");
+        roundtrip("(?U)a{1,}", "a{1,}?");
+        roundtrip("(?U)a{1,5}", "a{1,5}?");
+    }
+
+    #[test]
+    fn print_group() {
+        roundtrip("()", "()");
+        roundtrip("(?P<foo>)", "(?P<foo>)");
+        roundtrip("(?:)", "(?:)");
+
+        roundtrip("(a)", "(a)");
+        roundtrip("(?P<foo>a)", "(?P<foo>a)");
+        roundtrip("(?:a)", "(?:a)");
+
+        roundtrip("((((a))))", "((((a))))");
+    }
+
+    #[test]
+    fn print_alternation() {
+        roundtrip("|", "|");
+        roundtrip("||", "||");
+
+        roundtrip("a|b", "a|b");
+        roundtrip("a|b|c", "a|b|c");
+        roundtrip("foo|bar|quux", "foo|bar|quux");
+    }
+}