From 2b9acd37f0a13572684dde80e3e56d5c1b2ec045 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 18 Mar 2019 12:57:43 +0100
Subject: [PATCH] Fixed bug #72685

We currently have a large performance problem when implementing lexers
working on UTF-8 strings in PHP. This kind of code tends to perform a
large number of matches at different offsets on a single string. This
is generally fast. However, if /u mode is used, the full string will
be UTF-8 validated on each match. This results in quadratic runtime.

This patch fixes the issue by adding a IS_STR_VALID_UTF8 flag, which
is set when we have determined that the string is valid UTF8 and
further validation is skipped.

A limitation of this approach is that we can't set the flag for interned
strings. I think this is not a problem for this use-case which will
generally work on dynamic data. If we want to use this flag for other
purposes as well (mbstring?) then it might be worthwhile to UTF-8 validate
strings during interning. But right now this doesn't seem useful.
---
 NEWS                         |  4 ++++
 UPGRADING                    |  5 +++++
 Zend/zend_string.h           |  7 ++++---
 Zend/zend_types.h            |  1 +
 ext/pcre/php_pcre.c          |  9 +++++++--
 ext/pcre/tests/bug72685.phpt | 17 +++++++++++++++++
 6 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 ext/pcre/tests/bug72685.phpt

diff --git a/NEWS b/NEWS
index 1e2103ef96add..30b4103b31b3d 100644
--- a/NEWS
+++ b/NEWS
@@ -61,6 +61,10 @@ PHP                                                                        NEWS
   . openssl_random_pseudo_bytes() now throws in error conditions.
     (Sammy Kaye Powers)
 
+- PCRE:
+  . Fixed bug #72685 (Repeated UTF-8 validation of same string in UTF-8 mode).
+    (Nikita)
+
 - PDO_OCI:
   . Support Oracle Database tracing attributes ACTION, MODULE,
     CLIENT_INFO, and CLIENT_IDENTIFIER. (Cameron Porter)
diff --git a/UPGRADING b/UPGRADING
index 0152527217621..49c459e191aeb 100644
--- a/UPGRADING
+++ b/UPGRADING
@@ -415,3 +415,8 @@ The following extensions are affected:
     which improves performance of this function if it can be statically
     resolved. In namespaced code, this may require writing \array_key_exists()
     or explicitly importing the function.
+
+- PCRE:
+  . When preg_match() in UTF-8 mode ("u" modifier) is repeatedly called on the
+    same string (but possibly different offsets), it will only be checked for
+    UTF-8 validity once.
diff --git a/Zend/zend_string.h b/Zend/zend_string.h
index 1accff3cd21f7..4ede3f03819e0 100644
--- a/Zend/zend_string.h
+++ b/Zend/zend_string.h
@@ -79,7 +79,7 @@ END_EXTERN_C()
 	(str) = (zend_string *)do_alloca(ZEND_MM_ALIGNED_SIZE_EX(_ZSTR_STRUCT_SIZE(_len), 8), (use_heap)); \
 	GC_SET_REFCOUNT(str, 1); \
 	GC_TYPE_INFO(str) = IS_STRING; \
-	zend_string_forget_hash_val(str); \
+	ZSTR_H(str) = 0; \
 	ZSTR_LEN(str) = _len; \
 } while (0)
 
@@ -101,6 +101,7 @@ static zend_always_inline zend_ulong zend_string_hash_val(zend_string *s)
 static zend_always_inline void zend_string_forget_hash_val(zend_string *s)
 {
 	ZSTR_H(s) = 0;
+	GC_DEL_FLAGS(s, IS_STR_VALID_UTF8);
 }
 
 static zend_always_inline uint32_t zend_string_refcount(const zend_string *s)
@@ -133,7 +134,7 @@ static zend_always_inline zend_string *zend_string_alloc(size_t len, int persist
 
 	GC_SET_REFCOUNT(ret, 1);
 	GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT);
-	zend_string_forget_hash_val(ret);
+	ZSTR_H(ret) = 0;
 	ZSTR_LEN(ret) = len;
 	return ret;
 }
@@ -144,7 +145,7 @@ static zend_always_inline zend_string *zend_string_safe_alloc(size_t n, size_t m
 
 	GC_SET_REFCOUNT(ret, 1);
 	GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT);
-	zend_string_forget_hash_val(ret);
+	ZSTR_H(ret) = 0;
 	ZSTR_LEN(ret) = (n * m) + l;
 	return ret;
 }
diff --git a/Zend/zend_types.h b/Zend/zend_types.h
index f14f769ba6ff3..a5ca56e0cf67e 100644
--- a/Zend/zend_types.h
+++ b/Zend/zend_types.h
@@ -579,6 +579,7 @@ static zend_always_inline uint32_t zval_gc_info(uint32_t gc_type_info) {
 #define IS_STR_INTERNED				GC_IMMUTABLE  /* interned string */
 #define IS_STR_PERSISTENT			GC_PERSISTENT /* allocated using malloc */
 #define IS_STR_PERMANENT        	(1<<8)        /* relives request boundary */
+#define IS_STR_VALID_UTF8           (1<<9)        /* valid UTF-8 according to PCRE */
 
 /* array flags */
 #define IS_ARRAY_IMMUTABLE			GC_IMMUTABLE
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
index d18ab6ae32447..e1c46842b9d39 100644
--- a/ext/pcre/php_pcre.c
+++ b/ext/pcre/php_pcre.c
@@ -1104,7 +1104,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
 		}
 	}
 
-	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
+	options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
+		? 0 : PCRE2_NO_UTF_CHECK;
 
 	/* Execute the regular expression. */
 #ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1403,8 +1404,12 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
 		efree(subpat_names);
 	}
 
-	/* Did we encounter an error? */
 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
+		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
+		if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
+			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
+		}
+
 		RETVAL_LONG(matched);
 	} else {
 		RETVAL_FALSE;
diff --git a/ext/pcre/tests/bug72685.phpt b/ext/pcre/tests/bug72685.phpt
new file mode 100644
index 0000000000000..7f6eabc182bcd
--- /dev/null
+++ b/ext/pcre/tests/bug72685.phpt
@@ -0,0 +1,17 @@
+--TEST--
+Bug #72685: Same string is UTF-8 validated repeatedly
+--FILE--
+<?php
+
+$input_size = 64 * 1024;
+$str = str_repeat('a', $input_size);
+
+$start = microtime(true);
+$pos = 0;
+while (preg_match('/\G\w/u', $str, $m, 0, $pos)) ++$pos;
+$end = microtime(true);
+var_dump(($end - $start) < 0.5); // large margin, more like 0.05 in debug build
+
+?>
+--EXPECT--
+bool(true)