From 98c6e436e637999b41a40373f0a2e8c90a93ca81 Mon Sep 17 00:00:00 2001 From: fan Date: Mon, 20 Nov 2023 18:15:18 +0800 Subject: [PATCH 1/4] fix: fix json decode number Signed-off-by: fan --- arrow-json/src/reader/mod.rs | 38 ++++++++++++++++++++++++++- arrow-json/src/reader/string_array.rs | 16 ++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 71a73df9fedb..dbecef2e6d82 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -717,7 +717,9 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{make_array, Array, BooleanArray, ListArray, StringArray, StructArray}; + use arrow_array::{ + make_array, Array, BooleanArray, Float64Array, ListArray, StringArray, StructArray, + }; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; @@ -2259,4 +2261,38 @@ mod tests { .values(); assert_eq!(values, &[1699148028689, 2, 3, 4]); } + + #[test] + fn test_coercing_primitive_into_string_decoder() { + let buf = r#"[ + {"a": 1, "b": "A", "c": "T"}, + {"a": 2, "b": "BB", "c": "F"}, + {"a": 3, "b": 123, "c": false} + ]"#; + let schema = Schema::new(vec![ + Field::new("a", DataType::Float64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ]); + let json_array: Vec = serde_json::from_str(buf).unwrap(); + let schema_ref = Arc::new(schema); + + // read record batches + let reader = ReaderBuilder::new(schema_ref.clone()).with_coerce_primitive(true); + let mut decoder = reader.build_decoder().unwrap(); + decoder.serialize(json_array.as_slice()).unwrap(); + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!( + batch, + RecordBatch::try_new( + schema_ref, + vec![ + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + Arc::new(StringArray::from(vec!["A", "BB", "0"])), + Arc::new(StringArray::from(vec!["T", "F", "false"])), + ] + ) + .unwrap() + ); + } } diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 63a9bcedb7d1..83da23bbf3f7 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -61,7 +61,15 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { data_capacity += tape.get_string(idx).len(); } - _ => return Err(tape.error(*p, "string")), + TapeElement::I64(n) | TapeElement::I32(n) if coerce_primitive => { + data_capacity += n.to_string().len(); + } + TapeElement::F32(n) | TapeElement::F64(n) if coerce_primitive => { + data_capacity += n.to_string().len(); + } + _ => { + return Err(tape.error(*p, "string")); + } } } @@ -89,6 +97,12 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { builder.append_value(tape.get_string(idx)); } + TapeElement::I64(n) | TapeElement::I32(n) if coerce_primitive => { + builder.append_value(n.to_string()); + } + TapeElement::F32(n) | TapeElement::F64(n) if coerce_primitive => { + builder.append_value(n.to_string()); + } _ => unreachable!(), } } From 6470b6f5889199597f312afce6292291cb50acd5 Mon Sep 17 00:00:00 2001 From: fan Date: Mon, 20 Nov 2023 19:54:05 +0800 Subject: [PATCH 2/4] follow reviews Signed-off-by: fan --- arrow-json/src/reader/mod.rs | 21 +++++++++++++-------- arrow-json/src/reader/string_array.rs | 11 ++++++++++- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index dbecef2e6d82..5afe0dec279a 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -2264,11 +2264,11 @@ mod tests { #[test] fn test_coercing_primitive_into_string_decoder() { - let buf = r#"[ - {"a": 1, "b": "A", "c": "T"}, - {"a": 2, "b": "BB", "c": "F"}, - {"a": 3, "b": 123, "c": false} - ]"#; + let buf = &format!( + r#"[{{"a": 1, "b": "A", "c": "T"}}, {{"a": 2, "b": "BB", "c": "F"}}, {{"a": {}, "b": 123, "c": false}}, {{"a": {}, "b": 789, "c": true}}]"#, + (std::i32::MAX as i64 + 10), + std::i64::MAX - 10 + ); let schema = Schema::new(vec![ Field::new("a", DataType::Float64, true), Field::new("b", DataType::Utf8, true), @@ -2287,9 +2287,14 @@ mod tests { RecordBatch::try_new( schema_ref, vec![ - Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), - Arc::new(StringArray::from(vec!["A", "BB", "0"])), - Arc::new(StringArray::from(vec!["T", "F", "false"])), + Arc::new(Float64Array::from(vec![ + 1.0, + 2.0, + (std::i32::MAX as i64 + 10) as f64, + (std::i64::MAX - 10) as f64 + ])), + Arc::new(StringArray::from(vec!["A", "BB", "123", "789"])), + Arc::new(StringArray::from(vec!["T", "F", "false", "true"])), ] ) .unwrap() diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 83da23bbf3f7..6bbf2372dc33 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -97,7 +97,16 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { builder.append_value(tape.get_string(idx)); } - TapeElement::I64(n) | TapeElement::I32(n) if coerce_primitive => { + TapeElement::I64(high) if coerce_primitive => { + match tape.get(p + 1) { + TapeElement::I32(low) => { + let val = (high as i64) << 32 | (low as u32) as i64; + builder.append_value(val.to_string()); + } + _ => unreachable!(), + } + } + TapeElement::I32(n) if coerce_primitive => { builder.append_value(n.to_string()); } TapeElement::F32(n) | TapeElement::F64(n) if coerce_primitive => { From b64b0530d8877e03b80804f9397a0e0b200c4169 Mon Sep 17 00:00:00 2001 From: fan Date: Mon, 20 Nov 2023 20:03:17 +0800 Subject: [PATCH 3/4] follow reviews Signed-off-by: fan --- arrow-json/src/reader/string_array.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 6bbf2372dc33..1fb26298c405 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -97,21 +97,26 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { builder.append_value(tape.get_string(idx)); } - TapeElement::I64(high) if coerce_primitive => { - match tape.get(p + 1) { - TapeElement::I32(low) => { - let val = (high as i64) << 32 | (low as u32) as i64; - builder.append_value(val.to_string()); - } - _ => unreachable!(), + TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) { + TapeElement::I32(low) => { + let val = (high as i64) << 32 | (low as u32) as i64; + builder.append_value(val.to_string()); } - } + _ => unreachable!(), + }, TapeElement::I32(n) if coerce_primitive => { builder.append_value(n.to_string()); } - TapeElement::F32(n) | TapeElement::F64(n) if coerce_primitive => { + TapeElement::F32(n) if coerce_primitive => { builder.append_value(n.to_string()); } + TapeElement::F64(high) if coerce_primitive => match tape.get(p + 1) { + TapeElement::F32(low) => { + let val = f64::from_bits((high as u64) << 32 | low as u64); + builder.append_value(val.to_string()); + } + _ => unreachable!(), + }, _ => unreachable!(), } } From 7ef4634b521da372a737b42c5af1d26e36b19d14 Mon Sep 17 00:00:00 2001 From: fan Date: Tue, 21 Nov 2023 11:25:12 +0800 Subject: [PATCH 4/4] use fixed size space Signed-off-by: fan --- arrow-json/src/reader/string_array.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 1fb26298c405..5ab4d09d5d63 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -61,11 +61,14 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { data_capacity += tape.get_string(idx).len(); } - TapeElement::I64(n) | TapeElement::I32(n) if coerce_primitive => { - data_capacity += n.to_string().len(); - } - TapeElement::F32(n) | TapeElement::F64(n) if coerce_primitive => { - data_capacity += n.to_string().len(); + TapeElement::I64(_) + | TapeElement::I32(_) + | TapeElement::F64(_) + | TapeElement::F32(_) + if coerce_primitive => + { + // An arbitrary estimate + data_capacity += 10; } _ => { return Err(tape.error(*p, "string"));