diff --git a/Cargo.lock b/Cargo.lock index 249fdfbf9..2ba82639c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -138,9 +138,9 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" -version = "0.0.4" +version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b27d419c535bf7b50ad355278b1159cbf0cc8d507ea003d625b17bf0375720b8" +checksum = "e184598fea113663dd78e33a24ad3a1e7ba8ceedf71effb7406b3f2eccb63ed1" dependencies = [ "ahash", "lexical-core", diff --git a/Cargo.toml b/Cargo.toml index 512d98908..20f4b79b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,8 +43,7 @@ base64 = "0.21.5" num-bigint = "0.4.4" python3-dll-a = "0.2.7" uuid = "1.5.0" -jiter = {version = "0.0.4", features = ["python"]} -#jiter = {path = "../jiter", features = ["python"]} +jiter = {version = "0.0.5", features = ["python"]} [lib] name = "_pydantic_core" diff --git a/python/pydantic_core/_pydantic_core.pyi b/python/pydantic_core/_pydantic_core.pyi index 4c99a4d61..382a6c804 100644 --- a/python/pydantic_core/_pydantic_core.pyi +++ b/python/pydantic_core/_pydantic_core.pyi @@ -385,7 +385,7 @@ def to_json( JSON bytes. """ -def from_json(data: str | bytes | bytearray, *, allow_inf_nan: bool = True) -> Any: +def from_json(data: str | bytes | bytearray, *, allow_inf_nan: bool = True, cache_strings: bool = True) -> Any: """ Deserialize JSON data to a Python object. @@ -394,6 +394,8 @@ def from_json(data: str | bytes | bytearray, *, allow_inf_nan: bool = True) -> A Arguments: data: The JSON data to deserialize. allow_inf_nan: Whether to allow `Infinity`, `-Infinity` and `NaN` values as `json.loads()` does by default. + cache_strings: Whether to cache strings to avoid constructing new Python objects, + this should have a significant impact on performance while increasing memory usage slightly. Raises: ValueError: If deserialization fails. diff --git a/src/input/input_abstract.rs b/src/input/input_abstract.rs index ceb0495a9..8229677b6 100644 --- a/src/input/input_abstract.rs +++ b/src/input/input_abstract.rs @@ -4,8 +4,6 @@ use pyo3::exceptions::PyValueError; use pyo3::types::{PyDict, PyType}; use pyo3::{intern, prelude::*}; -use jiter::JsonValue; - use crate::errors::{AsLocItem, ErrorTypeDefaults, InputValue, ValError, ValResult}; use crate::tools::py_err; use crate::{PyMultiHostUrl, PyUrl}; @@ -89,8 +87,6 @@ pub trait Input<'a>: fmt::Debug + ToPyObject + AsLocItem + Sized { fn validate_dataclass_args(&'a self, dataclass_name: &str) -> ValResult>; - fn parse_json(&'a self) -> ValResult; - fn validate_str( &'a self, strict: bool, diff --git a/src/input/input_json.rs b/src/input/input_json.rs index 3e94adad6..195f9caba 100644 --- a/src/input/input_json.rs +++ b/src/input/input_json.rs @@ -14,7 +14,7 @@ use super::datetime::{ float_as_time, int_as_datetime, int_as_duration, int_as_time, EitherDate, EitherDateTime, EitherTime, }; use super::return_enums::ValidationMatch; -use super::shared::{float_as_int, int_as_bool, map_json_err, str_as_bool, str_as_float, str_as_int}; +use super::shared::{float_as_int, int_as_bool, str_as_bool, str_as_float, str_as_int}; use super::{ BorrowInput, EitherBytes, EitherFloat, EitherInt, EitherString, EitherTimedelta, GenericArguments, GenericIterable, GenericIterator, GenericMapping, Input, JsonArgs, @@ -84,13 +84,6 @@ impl<'a> Input<'a> for JsonValue { } } - fn parse_json(&'a self) -> ValResult { - match self { - JsonValue::Str(s) => JsonValue::parse(s.as_bytes(), true).map_err(|e| map_json_err(self, e)), - _ => Err(ValError::new(ErrorTypeDefaults::JsonType, self)), - } - } - fn exact_str(&'a self) -> ValResult> { match self { JsonValue::Str(s) => Ok(s.as_str().into()), @@ -367,10 +360,6 @@ impl<'a> Input<'a> for String { )) } - fn parse_json(&'a self) -> ValResult { - JsonValue::parse(self.as_bytes(), true).map_err(|e| map_json_err(self, e)) - } - fn validate_str( &'a self, _strict: bool, diff --git a/src/input/input_python.rs b/src/input/input_python.rs index ba42abbb8..5d4d4826c 100644 --- a/src/input/input_python.rs +++ b/src/input/input_python.rs @@ -10,7 +10,6 @@ use pyo3::types::{ use pyo3::types::{PyDictItems, PyDictKeys, PyDictValues}; use pyo3::{intern, PyTypeInfo}; -use jiter::JsonValue; use speedate::MicrosecondsPrecisionOverflowBehavior; use crate::errors::{AsLocItem, ErrorType, ErrorTypeDefaults, InputValue, LocItem, ValError, ValResult}; @@ -26,8 +25,7 @@ use super::datetime::{ }; use super::return_enums::ValidationMatch; use super::shared::{ - decimal_as_int, float_as_int, get_enum_meta_object, int_as_bool, map_json_err, str_as_bool, str_as_float, - str_as_int, + decimal_as_int, float_as_int, get_enum_meta_object, int_as_bool, str_as_bool, str_as_float, str_as_int, }; use super::{ py_string_str, BorrowInput, EitherBytes, EitherFloat, EitherInt, EitherString, EitherTimedelta, GenericArguments, @@ -195,22 +193,6 @@ impl<'a> Input<'a> for PyAny { } } - fn parse_json(&'a self) -> ValResult { - let bytes = if let Ok(py_bytes) = self.downcast::() { - py_bytes.as_bytes() - } else if let Ok(py_str) = self.downcast::() { - let str = py_string_str(py_str)?; - str.as_bytes() - } else if let Ok(py_byte_array) = self.downcast::() { - // Safety: from_slice does not run arbitrary Python code and the GIL is held so the - // bytes array will not be mutated while `JsonValue::parse` is reading it - unsafe { py_byte_array.as_bytes() } - } else { - return Err(ValError::new(ErrorTypeDefaults::JsonType, self)); - }; - JsonValue::parse(bytes, true).map_err(|e| map_json_err(self, e)) - } - fn validate_str( &'a self, strict: bool, diff --git a/src/input/input_string.rs b/src/input/input_string.rs index 290247617..0c2c9a8ca 100644 --- a/src/input/input_string.rs +++ b/src/input/input_string.rs @@ -1,7 +1,6 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyString}; -use jiter::JsonValue; use speedate::MicrosecondsPrecisionOverflowBehavior; use crate::errors::{AsLocItem, ErrorTypeDefaults, InputValue, LocItem, ValError, ValResult}; @@ -12,7 +11,7 @@ use crate::validators::decimal::create_decimal; use super::datetime::{ bytes_as_date, bytes_as_datetime, bytes_as_time, bytes_as_timedelta, EitherDate, EitherDateTime, EitherTime, }; -use super::shared::{map_json_err, str_as_bool, str_as_float}; +use super::shared::{str_as_bool, str_as_float}; use super::{ BorrowInput, EitherBytes, EitherFloat, EitherInt, EitherString, EitherTimedelta, GenericArguments, GenericIterable, GenericIterator, GenericMapping, Input, ValidationMatch, @@ -86,16 +85,6 @@ impl<'a> Input<'a> for StringMapping<'a> { } } - fn parse_json(&'a self) -> ValResult { - match self { - Self::String(s) => { - let str = py_string_str(s)?; - JsonValue::parse(str.as_bytes(), true).map_err(|e| map_json_err(self, e)) - } - Self::Mapping(_) => Err(ValError::new(ErrorTypeDefaults::JsonType, self)), - } - } - fn validate_str( &'a self, _strict: bool, diff --git a/src/input/shared.rs b/src/input/shared.rs index 647bce8a3..591c5abfc 100644 --- a/src/input/shared.rs +++ b/src/input/shared.rs @@ -1,10 +1,9 @@ use pyo3::sync::GILOnceCell; use pyo3::{intern, Py, PyAny, Python, ToPyObject}; -use jiter::JsonValueError; use num_bigint::BigInt; -use crate::errors::{ErrorType, ErrorTypeDefaults, ValError, ValResult}; +use crate::errors::{ErrorTypeDefaults, ValError, ValResult}; use super::{EitherFloat, EitherInt, Input}; static ENUM_META_OBJECT: GILOnceCell> = GILOnceCell::new(); @@ -20,16 +19,6 @@ pub fn get_enum_meta_object(py: Python) -> Py { .clone() } -pub fn map_json_err<'a>(input: &'a impl Input<'a>, error: JsonValueError) -> ValError { - ValError::new( - ErrorType::JsonInvalid { - error: error.to_string(), - context: None, - }, - input, - ) -} - pub fn str_as_bool<'a>(input: &'a impl Input<'a>, str: &str) -> ValResult { if str == "0" || str.eq_ignore_ascii_case("f") diff --git a/src/lib.rs b/src/lib.rs index f969c0657..de4a6d9bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,6 @@ extern crate core; use std::sync::OnceLock; use pyo3::exceptions::PyTypeError; -use pyo3::types::{PyByteArray, PyBytes, PyString}; use pyo3::{prelude::*, sync::GILOnceCell}; // parse this first to get access to the contained macro @@ -37,17 +36,16 @@ pub use serializers::{ }; pub use validators::{validate_core_schema, PySome, SchemaValidator}; -#[pyfunction(signature = (data, *, allow_inf_nan=true))] -pub fn from_json(py: Python, data: &PyAny, allow_inf_nan: bool) -> PyResult { - if let Ok(py_bytes) = data.downcast::() { - jiter::python_parse(py, py_bytes.as_bytes(), allow_inf_nan) - } else if let Ok(py_str) = data.downcast::() { - jiter::python_parse(py, py_str.to_str()?.as_bytes(), allow_inf_nan) - } else if let Ok(py_byte_array) = data.downcast::() { - jiter::python_parse(py, &py_byte_array.to_vec(), allow_inf_nan) - } else { - Err(PyTypeError::new_err("Expected bytes, bytearray or str")) - } +use crate::input::Input; + +#[pyfunction(signature = (data, *, allow_inf_nan=true, cache_strings=true))] +pub fn from_json(py: Python, data: &PyAny, allow_inf_nan: bool, cache_strings: bool) -> PyResult { + let v_match = data + .validate_bytes(false) + .map_err(|_| PyTypeError::new_err("Expected bytes, bytearray or str"))?; + let json_either_bytes = v_match.into_inner(); + let json_bytes = json_either_bytes.as_slice(); + jiter::python_parse(py, json_bytes, allow_inf_nan, cache_strings).map_err(|e| jiter::map_json_error(json_bytes, &e)) } pub fn get_pydantic_core_version() -> &'static str { diff --git a/src/validators/json.rs b/src/validators/json.rs index 7ce1f9e29..44250ef9d 100644 --- a/src/validators/json.rs +++ b/src/validators/json.rs @@ -2,8 +2,10 @@ use pyo3::intern; use pyo3::prelude::*; use pyo3::types::PyDict; -use crate::errors::ValResult; -use crate::input::Input; +use jiter::JsonValue; + +use crate::errors::{ErrorType, ErrorTypeDefaults, ValError, ValLineError, ValResult}; +use crate::input::{EitherBytes, Input, ValidationMatch}; use crate::tools::SchemaDict; use super::{build_validator, BuildValidator, CombinedValidator, DefinitionsBuilder, ValidationState, Validator}; @@ -50,13 +52,19 @@ impl Validator for JsonValidator { input: &'data impl Input<'data>, state: &mut ValidationState, ) -> ValResult { - let json_value = input.parse_json()?; + let v_match = validate_json_bytes(input)?; + let json_either_bytes = v_match.unpack(state); + let json_bytes = json_either_bytes.as_slice(); match self.validator { - Some(ref validator) => match validator.validate(py, &json_value, state) { - Ok(v) => Ok(v), - Err(err) => Err(err), - }, - None => Ok(json_value.to_object(py)), + Some(ref validator) => { + let json_value = JsonValue::parse(json_bytes, true).map_err(|e| map_json_err(input, e, json_bytes))?; + validator.validate(py, &json_value, state) + } + None => { + let obj = + jiter::python_parse(py, json_bytes, true, true).map_err(|e| map_json_err(input, e, json_bytes))?; + Ok(obj) + } } } @@ -64,3 +72,32 @@ impl Validator for JsonValidator { &self.name } } + +pub fn validate_json_bytes<'data>(input: &'data impl Input<'data>) -> ValResult>> { + match input.validate_bytes(false) { + Ok(v_match) => Ok(v_match), + Err(ValError::LineErrors(e)) => Err(ValError::LineErrors( + e.into_iter().map(map_bytes_error).collect::>(), + )), + Err(e) => Err(e), + } +} + +fn map_bytes_error(line_error: ValLineError) -> ValLineError { + match line_error.error_type { + ErrorType::BytesType { .. } => { + ValLineError::new_custom_input(ErrorTypeDefaults::JsonType, line_error.input_value) + } + _ => line_error, + } +} + +pub fn map_json_err<'a>(input: &'a impl Input<'a>, error: jiter::JsonError, json_bytes: &[u8]) -> ValError { + ValError::new( + ErrorType::JsonInvalid { + error: error.description(json_bytes), + context: None, + }, + input, + ) +} diff --git a/src/validators/mod.rs b/src/validators/mod.rs index ee95fa799..7809b1ee3 100644 --- a/src/validators/mod.rs +++ b/src/validators/mod.rs @@ -171,7 +171,6 @@ impl SchemaValidator { from_attributes, context, self_instance, - &mut RecursionGuard::default(), ) .map_err(|e| self.prepare_validation_err(py, e, InputType::Python)) } @@ -194,7 +193,6 @@ impl SchemaValidator { from_attributes, context, self_instance, - &mut RecursionGuard::default(), ) { Ok(_) => Ok(true), Err(ValError::InternalErr(err)) => Err(err), @@ -213,22 +211,18 @@ impl SchemaValidator { context: Option<&PyAny>, self_instance: Option<&PyAny>, ) -> PyResult { - let recursion_guard = &mut RecursionGuard::default(); - match input.parse_json() { - Ok(input) => self - ._validate( - py, - &input, - InputType::Json, - strict, - None, - context, - self_instance, - recursion_guard, - ) - .map_err(|e| self.prepare_validation_err(py, e, InputType::Json)), - Err(err) => Err(self.prepare_validation_err(py, err, InputType::Json)), - } + let r = match json::validate_json_bytes(input) { + Ok(v_match) => self._validate_json( + py, + input, + v_match.into_inner().as_slice(), + strict, + context, + self_instance, + ), + Err(err) => Err(err), + }; + r.map_err(|e| self.prepare_validation_err(py, e, InputType::Json)) } #[pyo3(signature = (input, *, strict=None, context=None))] @@ -242,8 +236,7 @@ impl SchemaValidator { let t = InputType::String; let string_mapping = StringMapping::new_value(input).map_err(|e| self.prepare_validation_err(py, e, t))?; - let recursion_guard = &mut RecursionGuard::default(); - match self._validate(py, &string_mapping, t, strict, None, context, None, recursion_guard) { + match self._validate(py, &string_mapping, t, strict, None, context, None) { Ok(r) => Ok(r), Err(e) => Err(self.prepare_validation_err(py, e, t)), } @@ -329,18 +322,32 @@ impl SchemaValidator { from_attributes: Option, context: Option<&'data PyAny>, self_instance: Option<&PyAny>, - recursion_guard: &'data mut RecursionGuard, ) -> ValResult where 's: 'data, { + let mut recursion_guard = RecursionGuard::default(); let mut state = ValidationState::new( Extra::new(strict, from_attributes, context, self_instance, input_type), - recursion_guard, + &mut recursion_guard, ); self.validator.validate(py, input, &mut state) } + fn _validate_json( + &self, + py: Python, + input: &PyAny, + json_data: &[u8], + strict: Option, + context: Option<&PyAny>, + self_instance: Option<&PyAny>, + ) -> ValResult { + let json_value = + jiter::JsonValue::parse(json_data, true).map_err(|e| json::map_json_err(input, e, json_data))?; + self._validate(py, &json_value, InputType::Json, strict, None, context, self_instance) + } + fn prepare_validation_err(&self, py: Python, error: ValError, input_type: InputType) -> PyErr { ValidationError::from_val_error( py, diff --git a/tests/validators/test_json.py b/tests/validators/test_json.py index d8666d335..228d18e55 100644 --- a/tests/validators/test_json.py +++ b/tests/validators/test_json.py @@ -48,36 +48,40 @@ def test_any(py_and_json: PyAndJson, input_value, expected): @pytest.mark.parametrize( 'input_value,expected', [ - ('{"a": 1}', {'a': 1}), - (b'{"a": 1}', {'a': 1}), - ( + pytest.param('{"a": 1}', {'a': 1}, id='str'), + pytest.param(b'{"a": 1}', {'a': 1}, id='bytes'), + pytest.param( '🐈 Hello \ud800World', Err( 'Input should be a valid string, unable to parse raw data as a unicode string ' "[type=string_unicode, input_value='🐈 Hello \\ud800World', input_type=str]" ), + id='str_unicode', ), - (bytearray(b'{"a": 1}'), {'a': 1}), - ( + pytest.param(bytearray(b'{"a": 1}'), {'a': 1}, id='bytearray'), + pytest.param( 'xx', Err( 'Invalid JSON: expected value at line 1 column 1 ' "[type=json_invalid, input_value='xx', input_type=str]" ), + id='str_invalid', ), - ( + pytest.param( b'xx', Err( 'Invalid JSON: expected value at line 1 column 1 ' "[type=json_invalid, input_value=b'xx', input_type=bytes]" ), + id='bytes_invalid', ), - ( + pytest.param( bytearray(b'xx'), Err( 'Invalid JSON: expected value at line 1 column 1 ' "[type=json_invalid, input_value=bytearray(b'xx'), input_type=bytearray]" ), + id='bytearray_invalid', ), ], )