From 26149a3d6581c9a840d999b8a386744b72ef1c89 Mon Sep 17 00:00:00 2001 From: Tyson Andre Date: Sun, 2 Oct 2022 09:52:32 -0400 Subject: [PATCH] Improve buffer reuse. Check for out of memory error code Closes #80 - simdjson_is_valid() and other PHP functions would previously return false when out of memory - Related to #60 - other php apis (using emalloc instead) will also emit fatal errors when out of memory and end the process. Closes #79 - reuse buffers for strings less than 1000000 bytes and 100000 depth. (Assumes the depth rarely changes in callers) --- php_simdjson.cpp | 58 ++++++++++++++++++++++++-------------- src/bindings.cpp | 72 ++++++++++++++++++++++++++---------------------- src/bindings.h | 31 ++++++++++++++++----- 3 files changed, 101 insertions(+), 60 deletions(-) diff --git a/php_simdjson.cpp b/php_simdjson.cpp index 6a6628c..709cae1 100644 --- a/php_simdjson.cpp +++ b/php_simdjson.cpp @@ -66,22 +66,8 @@ SIMDJSON_ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(simdjson_key_count_arginfo, 0, ZEND_ARG_TYPE_INFO(0, depth, IS_LONG, 0) ZEND_END_ARG_INFO() -extern simdjson::dom::parser* cplus_simdjson_create_parser(void); - -extern void cplus_simdjson_free_parser(simdjson::dom::parser* parser); - -extern bool cplus_simdjson_is_valid(simdjson::dom::parser& parser, const char *json, size_t len, size_t depth); - -extern void cplus_simdjson_parse(simdjson::dom::parser& parser, const char *json, size_t len, zval *return_value, unsigned char assoc, size_t depth); - -extern void cplus_simdjson_key_value(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, unsigned char assoc, size_t depth); - -extern u_short cplus_simdjson_key_exists(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, size_t depth); - -extern void cplus_simdjson_key_count(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, size_t depth); - #define SIMDJSON_G(v) ZEND_MODULE_GLOBALS_ACCESSOR(simdjson, v) -static simdjson::dom::parser &simdjson_get_parser() { +static simdjson::dom::parser &simdjson_get_reused_parser() { simdjson::dom::parser *parser = (simdjson::dom::parser *)SIMDJSON_G(parser); if (parser == NULL) { parser = cplus_simdjson_create_parser(); @@ -114,7 +100,14 @@ PHP_FUNCTION (simdjson_is_valid) { if (!simdjson_validate_depth(depth)) { RETURN_NULL(); } - bool is_json = cplus_simdjson_is_valid(simdjson_get_parser(), ZSTR_VAL(json), ZSTR_LEN(json), depth); + bool is_json; + if (SIMDJSON_SHOULD_REUSE_PARSER(ZSTR_LEN(json), depth)) { + is_json = cplus_simdjson_is_valid(simdjson_get_reused_parser(), ZSTR_VAL(json), ZSTR_LEN(json), depth); + } else { + simdjson::dom::parser parser; + SIMDJSON_TRY_ALLOCATE(parser, ZSTR_LEN(json), depth); + is_json = cplus_simdjson_is_valid(parser, ZSTR_VAL(json), ZSTR_LEN(json), depth); + } ZVAL_BOOL(return_value, is_json); } @@ -128,7 +121,13 @@ PHP_FUNCTION (simdjson_decode) { if (!simdjson_validate_depth(depth)) { RETURN_NULL(); } - cplus_simdjson_parse(simdjson_get_parser(), ZSTR_VAL(json), ZSTR_LEN(json), return_value, assoc, depth); + if (SIMDJSON_SHOULD_REUSE_PARSER(ZSTR_LEN(json), depth)) { + cplus_simdjson_parse(simdjson_get_reused_parser(), ZSTR_VAL(json), ZSTR_LEN(json), return_value, assoc, depth); + } else { + simdjson::dom::parser parser; + SIMDJSON_TRY_ALLOCATE(parser, ZSTR_LEN(json), depth); + cplus_simdjson_parse(parser, ZSTR_VAL(json), ZSTR_LEN(json), return_value, assoc, depth); + } } PHP_FUNCTION (simdjson_key_value) { @@ -143,7 +142,13 @@ PHP_FUNCTION (simdjson_key_value) { if (!simdjson_validate_depth(depth)) { RETURN_NULL(); } - cplus_simdjson_key_value(simdjson_get_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, assoc, depth); + if (SIMDJSON_SHOULD_REUSE_PARSER(ZSTR_LEN(json), depth)) { + cplus_simdjson_key_value(simdjson_get_reused_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, assoc, depth); + } else { + simdjson::dom::parser parser; + SIMDJSON_TRY_ALLOCATE(parser, ZSTR_LEN(json), depth); + cplus_simdjson_key_value(parser, ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, assoc, depth); + } } PHP_FUNCTION (simdjson_key_count) { @@ -156,7 +161,13 @@ PHP_FUNCTION (simdjson_key_count) { if (!simdjson_validate_depth(depth)) { RETURN_NULL(); } - cplus_simdjson_key_count(simdjson_get_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, depth); + if (SIMDJSON_SHOULD_REUSE_PARSER(ZSTR_LEN(json), depth)) { + cplus_simdjson_key_count(simdjson_get_reused_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, depth); + } else { + simdjson::dom::parser parser; + SIMDJSON_TRY_ALLOCATE(parser, ZSTR_LEN(json), depth); + cplus_simdjson_key_count(parser, ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), return_value, depth); + } } PHP_FUNCTION (simdjson_key_exists) { @@ -169,7 +180,14 @@ PHP_FUNCTION (simdjson_key_exists) { if (!simdjson_validate_depth(depth)) { return; } - u_short stats = cplus_simdjson_key_exists(simdjson_get_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), depth); + u_short stats; + if (SIMDJSON_SHOULD_REUSE_PARSER(ZSTR_LEN(json), depth)) { + stats = cplus_simdjson_key_exists(simdjson_get_reused_parser(), ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), depth); + } else { + simdjson::dom::parser parser; + SIMDJSON_TRY_ALLOCATE(parser, ZSTR_LEN(json), depth); + stats = cplus_simdjson_key_exists(parser, ZSTR_VAL(json), ZSTR_LEN(json), ZSTR_VAL(key), depth); + } if (SIMDJSON_PARSE_FAIL == stats) { RETURN_NULL(); } else if (SIMDJSON_PARSE_KEY_EXISTS == stats) { diff --git a/src/bindings.cpp b/src/bindings.cpp index 24276f0..36e896f 100644 --- a/src/bindings.cpp +++ b/src/bindings.cpp @@ -26,8 +26,6 @@ extern "C" { #define zend_string_release_ex(s, persistent) zend_string_release((s)) #endif -#define SIMDJSON_DEPTH_CHECK_THRESHOLD 100000 - static inline simdjson::simdjson_result get_key_with_optional_prefix(simdjson::dom::element &doc, std::string_view json_pointer) { @@ -37,34 +35,42 @@ get_key_with_optional_prefix(simdjson::dom::element &doc, std::string_view json_ static simdjson::error_code build_parsed_json_cust(simdjson::dom::parser& parser, simdjson::dom::element &doc, const char *buf, size_t len, bool realloc_if_needed, - size_t depth = simdjson::DEFAULT_MAX_DEPTH) { - if (UNEXPECTED(depth > SIMDJSON_DEPTH_CHECK_THRESHOLD) && depth > len && depth > parser.max_depth()) { - /* - * Choose the depth in a way that both avoids frequent reallocations - * and avoids excessive amounts of wasted memory beyond multiples of the largest string ever decoded. - * - * If the depth is already sufficient to parse a string of length `len`, - * then use the parser's previous depth. - * - * Precondition: depth > len - * Postcondition: depth <= original_depth && depth > len - */ - if (len < SIMDJSON_DEPTH_CHECK_THRESHOLD) { - depth = SIMDJSON_DEPTH_CHECK_THRESHOLD; - } else if (depth > len * 2) { - // In callers, simdjson_validate_depth ensures depth <= SIMDJSON_MAX_DEPTH (which is <= SIZE_MAX/8), - // so len * 2 is even smaller than the previous depth and won't overflow. - depth = len * 2; + size_t depth = simdjson::DEFAULT_MAX_DEPTH) { /* {{{ */ + if (depth != parser.max_depth()) { + if (UNEXPECTED(depth > SIMDJSON_DEPTH_CHECK_THRESHOLD) && depth > len) { + /* + * Choose the depth in a way that both avoids frequent reallocations + * and avoids excessive amounts of wasted memory beyond multiples of the largest string ever decoded. + * + * If the depth is already sufficient to parse a string of length `len`, + * then use the parser's previous depth. + * + * Precondition: depth > len + * Postcondition: depth <= original_depth && depth > len + */ + if (len < SIMDJSON_DEPTH_CHECK_THRESHOLD) { + depth = SIMDJSON_DEPTH_CHECK_THRESHOLD; + } else if (depth > len * 2) { + // In callers, simdjson_validate_depth ensures depth <= SIMDJSON_MAX_DEPTH (which is <= SIZE_MAX/8), + // so len * 2 is even smaller than the previous depth and won't overflow. + depth = len * 2; + } } - } - auto error = parser.allocate(len, depth); - - if (error) { - return error; + // Allocate is the only api the simdjson::dom::parser class provides to change the max depth. + // parse() calls parse_into_document(), which calls ensure_capacity(), which will ensure the capacity is at least as much as needed. + SIMDJSON_TRY_ALLOCATE(parser, len, depth); } - error = parser.parse(buf, len, realloc_if_needed).get(doc); + // Parse the json into document doc (simdjson::dom::element). + // + // Note that when realloc_if_needed is true (always, in this extension's PHP functions), + // this creates a padded string copy with an extra 64 bytes of initialized padding + // after the end of the string. + // + // The buffers in the parser must not be overwritten or reallocated/freed until we're done processing doc. + auto error = parser.parse(buf, len, realloc_if_needed).get(doc); if (error) { + if (error == simdjson::MEMALLOC) { SIMDJSON_OUT_OF_MEMORY_ERROR_NORETURN(); } return error; } @@ -234,15 +240,15 @@ static zval create_object(simdjson::dom::element element) /* {{{ */ { /* }}} */ -simdjson::dom::parser* cplus_simdjson_create_parser(void) /* {{{ */ { +ZEND_API simdjson::dom::parser* cplus_simdjson_create_parser(void) /* {{{ */ { return new simdjson::dom::parser(); } -void cplus_simdjson_free_parser(simdjson::dom::parser* parser) /* {{{ */ { +ZEND_API void cplus_simdjson_free_parser(simdjson::dom::parser* parser) /* {{{ */ { delete parser; } -bool cplus_simdjson_is_valid(simdjson::dom::parser& parser, const char *json, size_t len, size_t depth) /* {{{ */ { +ZEND_API bool cplus_simdjson_is_valid(simdjson::dom::parser& parser, const char *json, size_t len, size_t depth) /* {{{ */ { simdjson::dom::element doc; /* The depth is passed in to ensure this behaves the same way for the same arguments */ auto error = build_parsed_json_cust(parser, doc, json, len, true, depth); @@ -254,7 +260,7 @@ bool cplus_simdjson_is_valid(simdjson::dom::parser& parser, const char *json, si /* }}} */ -void cplus_simdjson_parse(simdjson::dom::parser& parser, const char *json, size_t len, zval *return_value, unsigned char assoc, size_t depth) /* {{{ */ { +ZEND_API void cplus_simdjson_parse(simdjson::dom::parser& parser, const char *json, size_t len, zval *return_value, unsigned char assoc, size_t depth) /* {{{ */ { simdjson::dom::element doc; auto error = build_parsed_json_cust(parser, doc, json, len, true, depth); if (error) { @@ -269,7 +275,7 @@ void cplus_simdjson_parse(simdjson::dom::parser& parser, const char *json, size_ } } /* }}} */ -void cplus_simdjson_key_value(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, unsigned char assoc, +ZEND_API void cplus_simdjson_key_value(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, unsigned char assoc, size_t depth) /* {{{ */ { simdjson::dom::element doc; simdjson::dom::element element; @@ -295,7 +301,7 @@ void cplus_simdjson_key_value(simdjson::dom::parser& parser, const char *json, s /* }}} */ -u_short cplus_simdjson_key_exists(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, size_t depth) /* {{{ */ { +ZEND_API u_short cplus_simdjson_key_exists(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, size_t depth) /* {{{ */ { simdjson::dom::element doc; auto error = build_parsed_json_cust(parser, doc, json, len, true, depth); if (error) { @@ -311,7 +317,7 @@ u_short cplus_simdjson_key_exists(simdjson::dom::parser& parser, const char *jso /* }}} */ -void cplus_simdjson_key_count(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, size_t depth) /* {{{ */ { +ZEND_API void cplus_simdjson_key_count(simdjson::dom::parser& parser, const char *json, size_t len, const char *key, zval *return_value, size_t depth) /* {{{ */ { simdjson::dom::element doc; simdjson::dom::element element; diff --git a/src/bindings.h b/src/bindings.h index cf86bb4..9b3f7d2 100644 --- a/src/bindings.h +++ b/src/bindings.h @@ -11,12 +11,29 @@ +----------------------------------------------------------------------+ */ +#ifndef PHP_SIMDJSON_BINDINGS +#define PHP_SIMDJSON_BINDINGS + #include "simdjson.h" -simdjson::dom::parser* cplus_simdjson_create_parser(void); -void cplus_simdjson_free_parser(simdjson::dom::parser* parser); -bool cplus_simdjson_is_valid(const char *json, size_t len); -void cplus_simdjson_parse(const char *json, size_t len, zval *return_value, unsigned char assoc, size_t depth); -void cplus_simdjson_key_value(const char *json, size_t len, const char *key, zval *return_value, unsigned char assoc, size_t depth); -u_short cplus_simdjson_key_exists(const char *json, size_t len, const char *key, size_t depth); -void cplus_simdjson_key_count(const char *json, size_t len, const char *key, zval *return_value, size_t depth); +#define SIMDJSON_DEPTH_CHECK_THRESHOLD 100000 +#define SIMDJSON_CAPACITY_RECLAIM_THRESHOLD 1000000 + +#define SIMDJSON_SHOULD_REUSE_PARSER(strlen, depth) (EXPECTED((strlen) <= SIMDJSON_DEPTH_CHECK_THRESHOLD && (depth) <= SIMDJSON_DEPTH_CHECK_THRESHOLD)) + +ZEND_API simdjson::dom::parser* cplus_simdjson_create_parser(void); +ZEND_API void cplus_simdjson_free_parser(simdjson::dom::parser* parser); +ZEND_API bool cplus_simdjson_is_valid(simdjson::dom::parser &parser, const char *json, size_t len, size_t depth); +ZEND_API void cplus_simdjson_parse(simdjson::dom::parser &parser, const char *json, size_t len, zval *return_value, unsigned char assoc, size_t depth); +ZEND_API void cplus_simdjson_key_value(simdjson::dom::parser &parser, const char *json, size_t len, const char *key, zval *return_value, unsigned char assoc, size_t depth); +u_short cplus_simdjson_key_exists(simdjson::dom::parser &parser, const char *json, size_t len, const char *key, size_t depth); +ZEND_API void cplus_simdjson_key_count(simdjson::dom::parser &parser, const char *json, size_t len, const char *key, zval *return_value, size_t depth); + +// Emit out of memory errors to make behavior of json_is_valid consistent with other PHP APIs if it returns. https://github.com/crazyxman/simdjson_php/issues/80 +#define SIMDJSON_OUT_OF_MEMORY_ERROR_NORETURN() zend_error_noreturn(E_ERROR, "simdjson_php: Out of memory") +#define SIMDJSON_TRY_ALLOCATE(parser, capacity, depth) { \ + auto error = (parser).allocate((capacity), (depth)); \ + if (UNEXPECTED(error)) { SIMDJSON_OUT_OF_MEMORY_ERROR_NORETURN(); } \ + } + +#endif /* PHP_SIMDJSON_BINDINGS */