Fix generate_non_canonical_map_case, fix MapArray equality #1476
@@ -15,14 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.

+use crate::datatypes::DataType;
 use crate::{
     array::ArrayData,
     array::{data::count_nulls, OffsetSizeTrait},
     buffer::Buffer,
     util::bit_util::get_bit,
 };

-use super::{equal_range, utils::child_logical_null_buffer};
+use super::{
+    equal_range, equal_values, utils::child_logical_null_buffer, utils::equal_nulls,
+};

 fn lengths_equal<T: OffsetSizeTrait>(lhs: &[T], rhs: &[T]) -> bool {
     // invariant from `base_equal`
@@ -58,22 +61,47 @@ fn offset_value_equal<T: OffsetSizeTrait>(
     lhs_pos: usize,
     rhs_pos: usize,
     len: usize,
+    data_type: &DataType,
 ) -> bool {
     let lhs_start = lhs_offsets[lhs_pos].to_usize().unwrap();
     let rhs_start = rhs_offsets[rhs_pos].to_usize().unwrap();
     let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos];
     let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos];

-    lhs_len == rhs_len
-        && equal_range(
-            lhs_values,
-            rhs_values,
-            lhs_nulls,
-            rhs_nulls,
-            lhs_start,
-            rhs_start,
-            lhs_len.to_usize().unwrap(),
-        )
+    lhs_len == rhs_len && {
+        match data_type {
+            DataType::Map(_, _) => {
+                // Don't use `equal_range` which calls `utils::base_equal` that checks
+                // struct fields, but we don't enforce struct field names.
+                equal_nulls(
+                    lhs_values,
+                    rhs_values,
+                    lhs_nulls,
+                    rhs_nulls,
+                    lhs_start,
+                    rhs_start,
+                    lhs_len.to_usize().unwrap(),
+                ) && equal_values(
+                    lhs_values,
+                    rhs_values,
+                    lhs_nulls,
+                    rhs_nulls,
+                    lhs_start,
+                    rhs_start,
+                    lhs_len.to_usize().unwrap(),
+                )
+            }
+            _ => equal_range(
+                lhs_values,
+                rhs_values,
+                lhs_nulls,
+                rhs_nulls,
+                lhs_start,
+                rhs_start,
+                lhs_len.to_usize().unwrap(),
+            ),
+        }
+    }
 }

 pub(super) fn list_equal<T: OffsetSizeTrait>(
@@ -131,17 +159,46 @@ pub(super) fn list_equal<T: OffsetSizeTrait>(
         lengths_equal(
             &lhs_offsets[lhs_start..lhs_start + len],
             &rhs_offsets[rhs_start..rhs_start + len],
-        ) && equal_range(
-            lhs_values,
-            rhs_values,
-            child_lhs_nulls.as_ref(),
-            child_rhs_nulls.as_ref(),
-            lhs_offsets[lhs_start].to_usize().unwrap(),
-            rhs_offsets[rhs_start].to_usize().unwrap(),
-            (lhs_offsets[lhs_start + len] - lhs_offsets[lhs_start])
-                .to_usize()
-                .unwrap(),
-        )
+        ) && {
+            match lhs.data_type() {
+                DataType::Map(_, _) => {
+                    // Don't use `equal_range` which calls `utils::base_equal` that checks
+                    // struct fields, but we don't enforce struct field names.
+                    equal_nulls(
+                        lhs_values,
+                        rhs_values,
+                        child_lhs_nulls.as_ref(),
+                        child_rhs_nulls.as_ref(),
+                        lhs_offsets[lhs_start].to_usize().unwrap(),
+                        rhs_offsets[rhs_start].to_usize().unwrap(),
+                        (lhs_offsets[lhs_start + len] - lhs_offsets[lhs_start])
+                            .to_usize()
+                            .unwrap(),
+                    ) && equal_values(
+                        lhs_values,
+                        rhs_values,
+                        child_lhs_nulls.as_ref(),
+                        child_rhs_nulls.as_ref(),
+                        lhs_offsets[lhs_start].to_usize().unwrap(),
+                        rhs_offsets[rhs_start].to_usize().unwrap(),
+                        (lhs_offsets[lhs_start + len] - lhs_offsets[lhs_start])
+                            .to_usize()
+                            .unwrap(),
+                    )
+                }
+                _ => equal_range(
+                    lhs_values,
+                    rhs_values,
+                    child_lhs_nulls.as_ref(),
+                    child_rhs_nulls.as_ref(),
+                    lhs_offsets[lhs_start].to_usize().unwrap(),
+                    rhs_offsets[rhs_start].to_usize().unwrap(),
+                    (lhs_offsets[lhs_start + len] - lhs_offsets[lhs_start])
+                        .to_usize()
+                        .unwrap(),
+                ),
+            }
+        }
     } else {
         // get a ref of the parent null buffer bytes, to use in testing for nullness
         let lhs_null_bytes = lhs_nulls.unwrap().as_slice();

Reviewer (inline, on the repeated "Don't use `equal_range`" comment): Same as above.
@@ -166,6 +223,7 @@ pub(super) fn list_equal<T: OffsetSizeTrait>(
                    lhs_pos,
                    rhs_pos,
                    1,
+                   lhs.data_type(),
                )
            })
    }
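To make the effect of this change concrete, here is a small sketch (not part of the PR; the helper `map_with_entry_names` and the sample data are made up) that builds two `MapArray`s whose entry structs use different field names and checks that they now compare equal through `dyn Array`'s `PartialEq`, which routes into this equality code. It assumes `arrow` crate APIs of roughly this vintage (`ArrayData::builder`, `Buffer::from_slice_ref`, `MapArray::from`).

```rust
use arrow::array::{Array, ArrayData, Int32Array, MapArray, StringArray};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, Field};

/// Builds a two-row map array ({"a": 1} and {"b": 2, "c": 3}) whose entry
/// struct uses the given field names.
fn map_with_entry_names(entries: &str, key: &str, value: &str) -> MapArray {
    let keys = StringArray::from(vec!["a", "b", "c"]);
    let values = Int32Array::from(vec![1, 2, 3]);

    // Entry struct: one child for keys, one for values.
    let entry_type = DataType::Struct(vec![
        Field::new(key, DataType::Utf8, false),
        Field::new(value, DataType::Int32, false),
    ]);
    let entry_data = ArrayData::builder(entry_type.clone())
        .len(3)
        .add_child_data(keys.data().clone())
        .add_child_data(values.data().clone())
        .build()
        .unwrap();

    let map_type =
        DataType::Map(Box::new(Field::new(entries, entry_type, false)), false);
    let map_data = ArrayData::builder(map_type)
        .len(2)
        .add_buffer(Buffer::from_slice_ref(&[0i32, 1, 3])) // entry offsets per row
        .add_child_data(entry_data)
        .build()
        .unwrap();
    MapArray::from(map_data)
}

fn main() {
    let canonical = map_with_entry_names("entries", "key", "value");
    let renamed = map_with_entry_names("my_entries", "my_key", "my_value");

    // With this patch the arrays compare equal even though the entry struct
    // field names differ; before it, the comparison failed because the
    // struct field names were compared.
    let lhs: &dyn Array = &canonical;
    let rhs: &dyn Array = &renamed;
    assert!(lhs == rhs);
}
```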
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.

+use arrow::datatypes::Schema;
+use arrow::datatypes::{DataType, Field};
 use arrow::error::{ArrowError, Result};
 use arrow::ipc::reader::FileReader;
 use arrow::ipc::writer::FileWriter;
@@ -107,6 +109,47 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()>
     Ok(())
 }

+fn canonicalize_schema(schema: &Schema) -> Schema {
+    let fields = schema
+        .fields()
+        .iter()
+        .map(|field| match field.data_type() {
+            DataType::Map(child_field, sorted) => match child_field.data_type() {
+                DataType::Struct(fields) if fields.len() == 2 => {
+                    let first_field = fields.get(0).unwrap();
+                    let key_field = Field::new(
+                        "key",
+                        first_field.data_type().clone(),
+                        first_field.is_nullable(),
+                    );
+                    let second_field = fields.get(1).unwrap();
+                    let value_field = Field::new(
+                        "value",
+                        second_field.data_type().clone(),
+                        second_field.is_nullable(),
+                    );
+
+                    let struct_type = DataType::Struct(vec![key_field, value_field]);
+                    let child_field =
+                        Field::new("entries", struct_type, child_field.is_nullable());
+
+                    Field::new(
+                        field.name().as_str(),
+                        DataType::Map(Box::new(child_field), *sorted),
+                        field.is_nullable(),
+                    )
+                }
+                _ => panic!(
+                    "The child field of Map type should be Struct type with 2 fields."
+                ),
+            },
+            _ => field.clone(),
+        })
+        .collect::<Vec<_>>();
+
+    Schema::new(fields).with_metadata(schema.metadata().clone())
+}
+
 fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> {
     if verbose {
         eprintln!("Validating {} and {}", arrow_name, json_name);

Review discussion on `canonicalize_schema`:

Reviewer: Is the issue that other Arrow implementations create different names for the "key" and "value" fields?

Author: The generated JSON has fields named something other than "key" and "value"; I think that is why this is the non-canonical map case. But I'm not sure which Arrow implementations can actually produce such map arrays.
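As an illustration of what the canonicalization buys (my sketch, not from the PR; it assumes `canonicalize_schema` above is in scope, and the helper `map_schema`, the column name `pairs`, the test name, and the `my_*` field names are made up), two schemas that differ only in the map's entry field names become equal after canonicalization:

```rust
use arrow::datatypes::{DataType, Field, Schema};

/// A schema with a single map column whose entry struct uses the given names.
fn map_schema(entries: &str, key: &str, value: &str) -> Schema {
    Schema::new(vec![Field::new(
        "pairs",
        DataType::Map(
            Box::new(Field::new(
                entries,
                DataType::Struct(vec![
                    Field::new(key, DataType::Utf8, false),
                    Field::new(value, DataType::Int32, true),
                ]),
                false,
            )),
            false, // keys not sorted
        ),
        true,
    )])
}

#[test]
fn non_canonical_map_schema_matches_after_canonicalization() {
    // As the generated JSON (or another implementation) might name the fields.
    let non_canonical = map_schema("my_entries", "my_key", "my_value");
    // As the Rust implementation names them.
    let canonical = map_schema("entries", "key", "value");

    // The raw schemas differ only in those names...
    assert_ne!(non_canonical, canonical);
    // ...so `validate` compares them after canonicalization instead.
    assert_eq!(
        canonicalize_schema(&non_canonical),
        canonicalize_schema(&canonical)
    );
}
```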
@@ -121,7 +164,7 @@ fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> {
     let arrow_schema = arrow_reader.schema().as_ref().to_owned();

     // compare schemas
-    if json_file.schema != arrow_schema {
+    if canonicalize_schema(&json_file.schema) != canonicalize_schema(&arrow_schema) {
         return Err(ArrowError::ComputeError(format!(
             "Schemas do not match. JSON: {:?}. Arrow: {:?}",
             json_file.schema, arrow_schema

Reviewer: Something feels a bit off about this logic living at the edges, i.e. the interface. If the field names of a `DataType::Map` really don't matter, I would expect them either not to be stored, or at least to be stored as an implementation detail...
Review discussion on the Map special case in the equality code:

Reviewer: I'm probably missing something, but as far as I can tell you have modified `base_equal` to not check the names, so I'm not sure why this special case is still needed?

Author: Because here we compare the nested structs of two map arrays. At `utils::base_equal`, it compares two struct types, not the map types, and we shouldn't let `utils::base_equal` ignore field-name differences for struct types.

Reviewer: But why not just move this special case of `DataType::Map(_, _)` into the data type? Then this code would just do a standard `DataType::eq` on the `DataType::Map`, and it would internally special-case.

Author: Hmm, we don't implement `Eq` for `DataType`; do you mean we should? Even if we implemented it, here we are comparing the child data of the map array, so its data type is `Struct(vec!["key", "value"])`. We would still need to treat the map type specially here...

Reviewer: Aah, I see the problem, darn... 👍