Skip to content

Commit

Permalink
pcre2test: avoid printing invalid utf trail in partial match (#237)
Browse files Browse the repository at this point in the history
When match_invalid_utf is enabled, invalid UTF-8 data can't match,
but it was mistakenly being printed as part of a partial match
even though the ovector correctly didn't include it, as shown by:

  PCRE2 version 10.34 2019-11-21
    re> /(?<=..)X/match_invalid_utf,allvector
  data> XX\x80\=ph,ovector=1
  Partial match: \x{80}
  ** ovector[1] is not equal to the subject length: 2 != 3
   0: 2 2

Fix the logic so that the empty match that was returned is printed
instead, and address a buffer over-read when trying to decode a UTF-8
sequence that was missing code units.

Fixes: #235
  • Loading branch information
carenas authored Apr 21, 2023
1 parent 9bad465 commit 15a11d1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ testtemp2
testtemp2grep
testtry
testtrygrep
testSinput
testbtables
testsaved1
testsaved2

m4/libtool.m4
m4/ltoptions.m4
Expand Down
21 changes: 15 additions & 6 deletions src/pcre2test.c
Original file line number Diff line number Diff line change
Expand Up @@ -2913,14 +2913,15 @@ limit.
Argument:
utf8bytes a pointer to the byte vector
end a pointer to the end of the byte vector
vptr a pointer to an int to receive the value
Returns: > 0 => the number of bytes consumed
-6 to 0 => malformed UTF-8 character at offset = (-return)
*/

static int
utf82ord(PCRE2_SPTR8 utf8bytes, uint32_t *vptr)
utf82ord(PCRE2_SPTR8 utf8bytes, PCRE2_SPTR8 end, uint32_t *vptr)
{
uint32_t c = *utf8bytes++;
uint32_t d = c;
Expand All @@ -2942,6 +2943,8 @@ d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
{
if (utf8bytes >= end) return 0;

c = *utf8bytes++;
if ((c & 0xc0) != 0x80) return -(j+1);
s -= 6;
Expand Down Expand Up @@ -3052,14 +3055,16 @@ counts chars without printing (because pchar() does that). */

static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f)
{
PCRE2_SPTR8 end;
uint32_t c = 0;
int yield = 0;
if (length < 0) length = *p++;
end = p + length;
while (length-- > 0)
{
if (utf)
{
int rc = utf82ord(p, &c);
int rc = utf82ord(p, end, &c);
if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
{
length -= rc - 1;
Expand Down Expand Up @@ -3238,7 +3243,8 @@ if (!utf && (pat_patctl.control & CTL_UTF8_INPUT) == 0)
else while (len > 0)
{
uint32_t c;
int chlen = utf82ord(p, &c);
const uint8_t *end = p + len;
int chlen = utf82ord(p, end, &c);
if (chlen <= 0) return -1;
if (!utf && c > 0xffff) return -3;
if (c > 0x10ffff) return -2;
Expand Down Expand Up @@ -3329,13 +3335,14 @@ else while (len > 0)
int chlen;
uint32_t c;
uint32_t topbit = 0;
const uint8_t *end = p + len;
if (!utf && *p == 0xff && len > 1)
{
topbit = 0x80000000u;
p++;
len--;
}
chlen = utf82ord(p, &c);
chlen = utf82ord(p, end, &c);
if (chlen <= 0) return -1;
if (utf && c > 0x10ffff) return -2;
p += chlen;
Expand Down Expand Up @@ -6852,7 +6859,9 @@ if (utf)
uint8_t *q;
uint32_t cc;
int n = 1;
for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &cc);
uint8_t *q_end = p + len;

for (q = p; n > 0 && *q; q += n) n = utf82ord(q, q_end, &cc);
if (n <= 0)
{
fprintf(outfile, "** Failed: invalid UTF-8 string cannot be used as input "
Expand Down Expand Up @@ -8081,7 +8090,7 @@ for (gmatched = 0;; gmatched++)
rubriclength += 15;

PCHARS(backlength, pp, leftchar, ovector[0] - leftchar, utf, outfile);
PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
PCHARSV(pp, ovector[0], ovector[1] - ovector[0], utf, outfile);

if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
fprintf(outfile, " (JIT)");
Expand Down

0 comments on commit 15a11d1

Please sign in to comment.