Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Cleanup row count handling in JSON writer #3934

Merged
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
110 changes: 44 additions & 66 deletions arrow-json/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,21 +124,15 @@ where

fn struct_array_to_jsonmap_array(
array: &StructArray,
row_count: usize,
) -> Result<Vec<JsonMap<String, Value>>, ArrowError> {
let inner_col_names = array.column_names();

let mut inner_objs = iter::repeat(JsonMap::new())
.take(row_count)
.take(array.len())
.collect::<Vec<JsonMap<String, Value>>>();

for (j, struct_col) in array.columns().iter().enumerate() {
set_column_for_json_rows(
&mut inner_objs,
row_count,
struct_col,
inner_col_names[j],
)?
set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j])?
}
Ok(inner_objs)
}
Expand Down Expand Up @@ -197,8 +191,7 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result<Vec<Value>, ArrowError> {
})
.collect(),
DataType::Struct(_) => {
let jsonmaps =
struct_array_to_jsonmap_array(as_struct_array(array), array.len())?;
let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?;
Ok(jsonmaps.into_iter().map(Value::Object).collect())
}
t => Err(ArrowError::JsonError(format!(
Expand All @@ -208,21 +201,21 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result<Vec<Value>, ArrowError> {
}

macro_rules! set_column_by_array_type {
($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $row_count:ident) => {
($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident) => {
let arr = $cast_fn($array);
$rows.iter_mut().zip(arr.iter()).take($row_count).for_each(
|(row, maybe_value)| {
$rows
.iter_mut()
.zip(arr.iter())
.for_each(|(row, maybe_value)| {
if let Some(v) = maybe_value {
row.insert($col_name.to_string(), v.into());
}
},
);
});
};
}

fn set_column_by_primitive_type<T>(
rows: &mut [JsonMap<String, Value>],
row_count: usize,
array: &ArrayRef,
col_name: &str,
) where
Expand All @@ -233,7 +226,6 @@ fn set_column_by_primitive_type<T>(

rows.iter_mut()
.zip(primitive_arr.iter())
.take(row_count)
.for_each(|(row, maybe_value)| {
// when value is null, we simply skip setting the key
if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) {
Expand All @@ -244,58 +236,51 @@ fn set_column_by_primitive_type<T>(

fn set_column_for_json_rows(
rows: &mut [JsonMap<String, Value>],
row_count: usize,
array: &ArrayRef,
col_name: &str,
) -> Result<(), ArrowError> {
match array.data_type() {
DataType::Int8 => {
set_column_by_primitive_type::<Int8Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Int8Type>(rows, array, col_name);
}
DataType::Int16 => {
set_column_by_primitive_type::<Int16Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Int16Type>(rows, array, col_name);
}
DataType::Int32 => {
set_column_by_primitive_type::<Int32Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Int32Type>(rows, array, col_name);
}
DataType::Int64 => {
set_column_by_primitive_type::<Int64Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Int64Type>(rows, array, col_name);
}
DataType::UInt8 => {
set_column_by_primitive_type::<UInt8Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<UInt8Type>(rows, array, col_name);
}
DataType::UInt16 => {
set_column_by_primitive_type::<UInt16Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<UInt16Type>(rows, array, col_name);
}
DataType::UInt32 => {
set_column_by_primitive_type::<UInt32Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<UInt32Type>(rows, array, col_name);
}
DataType::UInt64 => {
set_column_by_primitive_type::<UInt64Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<UInt64Type>(rows, array, col_name);
}
DataType::Float32 => {
set_column_by_primitive_type::<Float32Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Float32Type>(rows, array, col_name);
}
DataType::Float64 => {
set_column_by_primitive_type::<Float64Type>(rows, row_count, array, col_name);
set_column_by_primitive_type::<Float64Type>(rows, array, col_name);
}
DataType::Null => {
// when value is null, we simply skip setting the key
}
DataType::Boolean => {
set_column_by_array_type!(as_boolean_array, col_name, rows, array, row_count);
set_column_by_array_type!(as_boolean_array, col_name, rows, array);
}
DataType::Utf8 => {
set_column_by_array_type!(as_string_array, col_name, rows, array, row_count);
set_column_by_array_type!(as_string_array, col_name, rows, array);
}
DataType::LargeUtf8 => {
set_column_by_array_type!(
as_largestring_array,
col_name,
rows,
array,
row_count
);
set_column_by_array_type!(as_largestring_array, col_name, rows, array);
}
DataType::Date32
| DataType::Date64
Expand All @@ -306,61 +291,53 @@ fn set_column_for_json_rows(
let options = FormatOptions::default();
let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?;
let data = array.data();
rows.iter_mut()
.take(row_count)
.enumerate()
.for_each(|(idx, row)| {
if data.is_valid(idx) {
row.insert(
col_name.to_string(),
formatter.value(idx).to_string().into(),
);
}
});
rows.iter_mut().enumerate().for_each(|(idx, row)| {
if data.is_valid(idx) {
row.insert(
col_name.to_string(),
formatter.value(idx).to_string().into(),
);
}
});
}
DataType::Struct(_) => {
let inner_objs =
struct_array_to_jsonmap_array(as_struct_array(array), row_count)?;
let inner_objs = struct_array_to_jsonmap_array(array.as_struct())?;
rows.iter_mut()
.take(row_count)
.zip(inner_objs.into_iter())
.for_each(|(row, obj)| {
row.insert(col_name.to_string(), Value::Object(obj));
});
}
DataType::List(_) => {
let listarr = as_list_array(array);
rows.iter_mut()
.zip(listarr.iter())
.take(row_count)
.try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> {
rows.iter_mut().zip(listarr.iter()).try_for_each(
|(row, maybe_value)| -> Result<(), ArrowError> {
if let Some(v) = maybe_value {
row.insert(
col_name.to_string(),
Value::Array(array_to_json_array(&v)?),
);
}
Ok(())
})?;
},
)?;
}
DataType::LargeList(_) => {
let listarr = as_large_list_array(array);
rows.iter_mut()
.zip(listarr.iter())
.take(row_count)
.try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> {
rows.iter_mut().zip(listarr.iter()).try_for_each(
|(row, maybe_value)| -> Result<(), ArrowError> {
if let Some(v) = maybe_value {
let val = array_to_json_array(&v)?;
row.insert(col_name.to_string(), Value::Array(val));
}
Ok(())
})?;
},
)?;
}
DataType::Dictionary(_, value_type) => {
let slice = array.slice(0, row_count);
Copy link
Contributor Author

@tustvold tustvold Mar 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This slice was redundant, the row count would always match the array length

let hydrated = arrow_cast::cast::cast(&slice, value_type)
let hydrated = arrow_cast::cast::cast(&array, value_type)
.expect("cannot cast dictionary to underlying values");
set_column_for_json_rows(rows, row_count, &hydrated, col_name)?;
set_column_for_json_rows(rows, &hydrated, col_name)?;
}
DataType::Map(_, _) => {
let maparr = as_map_array(array);
Expand All @@ -381,7 +358,7 @@ fn set_column_for_json_rows(

let mut kv = keys.iter().zip(values.into_iter());

for (i, row) in rows.iter_mut().take(row_count).enumerate() {
for (i, row) in rows.iter_mut().enumerate() {
if maparr.is_null(i) {
row.insert(col_name.to_string(), serde_json::Value::Null);
continue;
Expand Down Expand Up @@ -424,9 +401,10 @@ pub fn record_batches_to_json_rows(
let mut base = 0;
for batch in batches {
let row_count = batch.num_rows();
let row_slice = &mut rows[base..base + batch.num_rows()];
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the change that enables removing row_count, it doesn't appear to be used elsewhere

for (j, col) in batch.columns().iter().enumerate() {
let col_name = schema.field(j).name();
set_column_for_json_rows(&mut rows[base..], row_count, col, col_name)?
set_column_for_json_rows(row_slice, col, col_name)?
}
base += row_count;
}
Expand Down