From 2b9acd37f0a13572684dde80e3e56d5c1b2ec045 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 18 Mar 2019 12:57:43 +0100 Subject: [PATCH] Fixed bug #72685 We currently have a large performance problem when implementing lexers working on UTF-8 strings in PHP. This kind of code tends to perform a large number of matches at different offsets on a single string. This is generally fast. However, if /u mode is used, the full string will be UTF-8 validated on each match. This results in quadratic runtime. This patch fixes the issue by adding a IS_STR_VALID_UTF8 flag, which is set when we have determined that the string is valid UTF8 and further validation is skipped. A limitation of this approach is that we can't set the flag for interned strings. I think this is not a problem for this use-case which will generally work on dynamic data. If we want to use this flag for other purposes as well (mbstring?) then it might be worthwhile to UTF-8 validate strings during interning. But right now this doesn't seem useful. --- NEWS | 4 ++++ UPGRADING | 5 +++++ Zend/zend_string.h | 7 ++++--- Zend/zend_types.h | 1 + ext/pcre/php_pcre.c | 9 +++++++-- ext/pcre/tests/bug72685.phpt | 17 +++++++++++++++++ 6 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 ext/pcre/tests/bug72685.phpt diff --git a/NEWS b/NEWS index 1e2103ef96add..30b4103b31b3d 100644 --- a/NEWS +++ b/NEWS @@ -61,6 +61,10 @@ PHP NEWS . openssl_random_pseudo_bytes() now throws in error conditions. (Sammy Kaye Powers) +- PCRE: + . Fixed bug #72685 (Repeated UTF-8 validation of same string in UTF-8 mode). + (Nikita) + - PDO_OCI: . Support Oracle Database tracing attributes ACTION, MODULE, CLIENT_INFO, and CLIENT_IDENTIFIER. (Cameron Porter) diff --git a/UPGRADING b/UPGRADING index 0152527217621..49c459e191aeb 100644 --- a/UPGRADING +++ b/UPGRADING @@ -415,3 +415,8 @@ The following extensions are affected: which improves performance of this function if it can be statically resolved. In namespaced code, this may require writing \array_key_exists() or explicitly importing the function. + +- PCRE: + . When preg_match() in UTF-8 mode ("u" modifier) is repeatedly called on the + same string (but possibly different offsets), it will only be checked for + UTF-8 validity once. diff --git a/Zend/zend_string.h b/Zend/zend_string.h index 1accff3cd21f7..4ede3f03819e0 100644 --- a/Zend/zend_string.h +++ b/Zend/zend_string.h @@ -79,7 +79,7 @@ END_EXTERN_C() (str) = (zend_string *)do_alloca(ZEND_MM_ALIGNED_SIZE_EX(_ZSTR_STRUCT_SIZE(_len), 8), (use_heap)); \ GC_SET_REFCOUNT(str, 1); \ GC_TYPE_INFO(str) = IS_STRING; \ - zend_string_forget_hash_val(str); \ + ZSTR_H(str) = 0; \ ZSTR_LEN(str) = _len; \ } while (0) @@ -101,6 +101,7 @@ static zend_always_inline zend_ulong zend_string_hash_val(zend_string *s) static zend_always_inline void zend_string_forget_hash_val(zend_string *s) { ZSTR_H(s) = 0; + GC_DEL_FLAGS(s, IS_STR_VALID_UTF8); } static zend_always_inline uint32_t zend_string_refcount(const zend_string *s) @@ -133,7 +134,7 @@ static zend_always_inline zend_string *zend_string_alloc(size_t len, int persist GC_SET_REFCOUNT(ret, 1); GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT); - zend_string_forget_hash_val(ret); + ZSTR_H(ret) = 0; ZSTR_LEN(ret) = len; return ret; } @@ -144,7 +145,7 @@ static zend_always_inline zend_string *zend_string_safe_alloc(size_t n, size_t m GC_SET_REFCOUNT(ret, 1); GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT); - zend_string_forget_hash_val(ret); + ZSTR_H(ret) = 0; ZSTR_LEN(ret) = (n * m) + l; return ret; } diff --git a/Zend/zend_types.h b/Zend/zend_types.h index f14f769ba6ff3..a5ca56e0cf67e 100644 --- a/Zend/zend_types.h +++ b/Zend/zend_types.h @@ -579,6 +579,7 @@ static zend_always_inline uint32_t zval_gc_info(uint32_t gc_type_info) { #define IS_STR_INTERNED GC_IMMUTABLE /* interned string */ #define IS_STR_PERSISTENT GC_PERSISTENT /* allocated using malloc */ #define IS_STR_PERMANENT (1<<8) /* relives request boundary */ +#define IS_STR_VALID_UTF8 (1<<9) /* valid UTF-8 according to PCRE */ /* array flags */ #define IS_ARRAY_IMMUTABLE GC_IMMUTABLE diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index d18ab6ae32447..e1c46842b9d39 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -1104,7 +1104,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, } } - options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; + options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8) + ? 0 : PCRE2_NO_UTF_CHECK; /* Execute the regular expression. */ #ifdef HAVE_PCRE_JIT_SUPPORT @@ -1403,8 +1404,12 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, efree(subpat_names); } - /* Did we encounter an error? */ if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) { + /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */ + if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) { + GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8); + } + RETVAL_LONG(matched); } else { RETVAL_FALSE; diff --git a/ext/pcre/tests/bug72685.phpt b/ext/pcre/tests/bug72685.phpt new file mode 100644 index 0000000000000..7f6eabc182bcd --- /dev/null +++ b/ext/pcre/tests/bug72685.phpt @@ -0,0 +1,17 @@ +--TEST-- +Bug #72685: Same string is UTF-8 validated repeatedly +--FILE-- + +--EXPECT-- +bool(true)