Fix percent delimiter strings with crlfs

parse.y treats CRLF as LF, effectively "normalizing" line endings before parsing. That means a string like `%\nfoo\r\n` is actually treated as `%\nfoo\n` for the purposes of parsing. This happens on the opening side of the percent string as well as on the closing side, so for example `%\r\nfoo\n` must also be treated as `%\nfoo\n`. To handle this in Prism, when we start a % string we check whether it starts with `\r\n`, and if so we consider the terminator to actually be `\n`. Then, as we lex the string, we check for `\r\n` sequences and treat them as `\n`, but only in the case where the string was opened with a `\n` (including a `\r\n` normalized as above). Fixes: #3230 [Bug #20938] Co-authored-by: John Hawthorn <[email protected]> Co-authored-by: eileencodes <[email protected]> Co-authored-by: Kevin Newton <[email protected]>
ruby · Dec 11, 2024 · e573cea · e573cea
1 parent 75a6171
commit e573cea
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 3 deletions.
diff --git a/src/prism.c b/src/prism.c
@@ -10508,6 +10508,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
     }
 
     const uint8_t *end = parser->current.end - 1;
+    assert(end >= start);
     pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start));
 
     token_buffer->cursor = end;
@@ -10588,9 +10589,15 @@ pm_lex_percent_delimiter(pm_parser_t *parser) {
             pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1);
         }
 
-        const uint8_t delimiter = *parser->current.end;
-        parser->current.end += eol_length;
+        uint8_t delimiter = *parser->current.end;
+
+        // If our delimiter is \r\n, we want to treat it as if it's \n.
+        // For example, %\r\nfoo\r\n should be "foo"
+        if (eol_length == 2) {
+            delimiter = *(parser->current.end + 1);
+        }
 
+        parser->current.end += eol_length;
         return delimiter;
     }
 
@@ -12340,10 +12347,28 @@ parser_lex(pm_parser_t *parser) {
                     continue;
                 }
 
+                bool is_terminator = (*breakpoint == lex_mode->as.string.terminator);
+
+                // If the terminator is newline, we need to consider \r\n _also_ a newline
+                // For example: `%\nfoo\r\n`
+                // The string should be "foo", not "foo\r"
+                if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') {
+                    if (lex_mode->as.string.terminator == '\n') {
+                        is_terminator = true;
+                    }
+
+                    // If the terminator is a CR, but we see a CRLF, we need to
+                    // treat the CRLF as a newline, meaning this is _not_ the
+                    // terminator
+                    if (lex_mode->as.string.terminator == '\r') {
+                        is_terminator = false;
+                    }
+                }
+
                 // Note that we have to check the terminator here first because we could
                 // potentially be parsing a % string that has a # character as the
                 // terminator.
-                if (*breakpoint == lex_mode->as.string.terminator) {
+                if (is_terminator) {
                     // If this terminator doesn't actually close the string, then we need
                     // to continue on past it.
                     if (lex_mode->as.string.nesting > 0) {

diff --git a/test/prism/percent_delimiter_string_test.rb b/test/prism/percent_delimiter_string_test.rb
@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+require_relative "test_helper"
+
+module Prism
+  class PercentDelimiterStringTest < TestCase
+    def test_newline_terminator_with_lf_crlf
+      str = "%\n123456\r\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_lf_crlf_with_extra_cr
+      str = "%\n123456\r\r\n"
+      assert_parse "123456\r", str
+    end
+
+    def test_newline_terminator_with_crlf_pair
+      str = "%\r\n123456\r\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_crlf_crlf_with_extra_cr
+      str = "%\r\n123456\r\r\n"
+      assert_parse "123456\r", str
+    end
+
+    def test_newline_terminator_with_cr_cr
+      str = "%\r123456\r;\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_crlf_lf
+      str = "%\r\n123456\n;\n"
+      assert_parse "123456", str
+    end
+
+    def test_cr_crlf
+      str = "%\r1\r\n \r"
+      assert_parse "1\n ", str
+    end
+
+    def test_lf_crlf
+      str = "%\n1\r\n \n"
+      assert_parse "1", str
+    end
+
+    def test_lf_lf
+      str = "%\n1\n \n"
+      assert_parse "1", str
+    end
+
+    def assert_parse(expected, str)
+      tree = Prism.parse str
+      node = tree.value.breadth_first_search { |x| Prism::StringNode === x }
+      assert_equal expected, node.unescaped
+    end
+  end
+end