pcre2test: avoid printing invalid utf trail in partial match (#237)

When match_invalid_utf is enabled, invalid UTF-8 data can't match but it was mistakenly getting printed as part of a partial match eventhough the ovector correctly didn't include it, as shown by: PCRE2 version 10.34 2019-11-21 re> /(?<=..)X/match_invalid_utf,allvector data> XX\x80\=ph,ovector=1 Partial match: \x{80} ** ovector[1] is not equal to the subject length: 2 != 3 0: 2 2 Fix the logic to print instead the empty match that was returned and address a buffer overread when trying to decode UTF-8 that was missing code units. Fixes: #235
PCRE2Project · Apr 21, 2023 · 15a11d1 · 15a11d1
1 parent 9bad465
commit 15a11d1
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -66,6 +66,10 @@ testtemp2
 testtemp2grep
 testtry
 testtrygrep
+testSinput
+testbtables
+testsaved1
+testsaved2
 
 m4/libtool.m4
 m4/ltoptions.m4

diff --git a/src/pcre2test.c b/src/pcre2test.c
@@ -2913,14 +2913,15 @@ limit.
 
 Argument:
   utf8bytes   a pointer to the byte vector
+  end         a pointer to the end of the byte vector
   vptr        a pointer to an int to receive the value
 
 Returns:      >  0 => the number of bytes consumed
               -6 to 0 => malformed UTF-8 character at offset = (-return)
 */
 
 static int
-utf82ord(PCRE2_SPTR8 utf8bytes, uint32_t *vptr)
+utf82ord(PCRE2_SPTR8 utf8bytes, PCRE2_SPTR8 end, uint32_t *vptr)
 {
 uint32_t c = *utf8bytes++;
 uint32_t d = c;
@@ -2942,6 +2943,8 @@ d = (c & utf8_table3[i]) << s;
 
 for (j = 0; j < i; j++)
   {
+  if (utf8bytes >= end) return 0;
+
   c = *utf8bytes++;
   if ((c & 0xc0) != 0x80) return -(j+1);
   s -= 6;
@@ -3052,14 +3055,16 @@ counts chars without printing (because pchar() does that). */
 
 static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f)
 {
+PCRE2_SPTR8 end;
 uint32_t c = 0;
 int yield = 0;
 if (length < 0) length = *p++;
+end = p + length;
 while (length-- > 0)
   {
   if (utf)
     {
-    int rc = utf82ord(p, &c);
+    int rc = utf82ord(p, end, &c);
     if (rc > 0 && rc <= length + 1)   /* Mustn't run over the end */
       {
       length -= rc - 1;
@@ -3238,7 +3243,8 @@ if (!utf && (pat_patctl.control & CTL_UTF8_INPUT) == 0)
 else while (len > 0)
   {
   uint32_t c;
-  int chlen = utf82ord(p, &c);
+  const uint8_t *end = p + len;
+  int chlen = utf82ord(p, end, &c);
   if (chlen <= 0) return -1;
   if (!utf && c > 0xffff) return -3;
   if (c > 0x10ffff) return -2;
@@ -3329,13 +3335,14 @@ else while (len > 0)
   int chlen;
   uint32_t c;
   uint32_t topbit = 0;
+  const uint8_t *end = p + len;
   if (!utf && *p == 0xff && len > 1)
     {
     topbit = 0x80000000u;
     p++;
     len--;
     }
-  chlen = utf82ord(p, &c);
+  chlen = utf82ord(p, end, &c);
   if (chlen <= 0) return -1;
   if (utf && c > 0x10ffff) return -2;
   p += chlen;
@@ -6852,7 +6859,9 @@ if (utf)
   uint8_t *q;
   uint32_t cc;
   int n = 1;
-  for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &cc);
+  uint8_t *q_end = p + len;
+
+  for (q = p; n > 0 && *q; q += n) n = utf82ord(q, q_end, &cc);
   if (n <= 0)
     {
     fprintf(outfile, "** Failed: invalid UTF-8 string cannot be used as input "
@@ -8081,7 +8090,7 @@ for (gmatched = 0;; gmatched++)
     rubriclength += 15;
 
     PCHARS(backlength, pp, leftchar, ovector[0] - leftchar, utf, outfile);
-    PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
+    PCHARSV(pp, ovector[0], ovector[1] - ovector[0], utf, outfile);
 
     if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
       fprintf(outfile, " (JIT)");