From b714d2ee41ee28efabda177b02a2e998743796f9 Mon Sep 17 00:00:00 2001 From: Arttu Voutilainen Date: Mon, 25 Nov 2024 19:33:37 +0100 Subject: [PATCH] apply same logic to datatype_is_semantically_equal --- datafusion/common/src/dfschema.rs | 172 +++++++++++++++++++++++++++++- 1 file changed, 169 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 5660505730cf..45620c3cacc8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -712,9 +712,26 @@ impl DFSchema { } (DataType::List(f1), DataType::List(f2)) | (DataType::LargeList(f1), DataType::LargeList(f2)) - | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) - | (DataType::Map(f1, _), DataType::Map(f2, _)) => { - Self::field_is_semantically_equal(f1, f2) + | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => { + // Don't compare the names of the technical inner field + // Usually "item" but that's not mandated + Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) + } + (DataType::Map(f1, _), DataType::Map(f2, _)) => { + // Don't compare the names of the technical inner fields + // Usually "entries", "key", "value" but that's not mandated + match (f1.data_type(), f2.data_type()) { + (DataType::Struct(f1_inner), DataType::Struct(f2_inner)) => { + f1_inner.len() == f2_inner.len() + && f1_inner.iter().zip(f2_inner.iter()).all(|(f1, f2)| { + Self::datatype_is_semantically_equal( + f1.data_type(), + f2.data_type(), + ) + }) + } + _ => panic!("Map type should have an inner struct field"), + } } (DataType::Struct(fields1), DataType::Struct(fields2)) => { let iter1 = fields1.iter(); @@ -1480,6 +1497,155 @@ mod tests { )); } + #[test] + fn test_datatype_is_logically_equivalent_to_dictionary() { + // Dictionary is logically equal to its value type + assert!(DFSchema::datatype_is_logically_equal( + &DataType::Utf8, + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) + )); + } + + #[test] + fn test_datatype_is_semantically_equal() { + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Int8, + &DataType::Int8 + )); + + assert!(!DFSchema::datatype_is_semantically_equal( + &DataType::Int8, + &DataType::Int16 + )); + + // Test lists + + // Succeeds if both have the same element type, disregards names and nullability + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::List(Field::new("item", DataType::Int8, true).into()), + &DataType::List(Field::new("element", DataType::Int8, false).into()) + )); + + // Fails if element type is different + assert!(!DFSchema::datatype_is_semantically_equal( + &DataType::List(Field::new("item", DataType::Int8, true).into()), + &DataType::List(Field::new("item", DataType::Int16, true).into()) + )); + + // Test maps + let map_field = DataType::Map( + Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int8, false), + Field::new("value", DataType::Int8, true), + ])), + true, + ) + .into(), + true, + ); + + // Succeeds if both maps have the same key and value types, disregards names and nullability + assert!(DFSchema::datatype_is_semantically_equal( + &map_field, + &DataType::Map( + Field::new( + "pairs", + DataType::Struct(Fields::from(vec![ + Field::new("one", DataType::Int8, false), + Field::new("two", DataType::Int8, false) + ])), + true + ) + .into(), + true + ) + )); + // Fails if value type is different + assert!(!DFSchema::datatype_is_semantically_equal( + &map_field, + &DataType::Map( + Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int8, false), + Field::new("value", DataType::Int16, true) + ])), + true + ) + .into(), + true + ) + )); + + // Fails if key type is different + assert!(!DFSchema::datatype_is_semantically_equal( + &map_field, + &DataType::Map( + Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int16, false), + Field::new("value", DataType::Int8, true) + ])), + true + ) + .into(), + true + ) + )); + + // Test structs + + let struct_field = DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int8, true), + Field::new("b", DataType::Int8, true), + ])); + + // Succeeds if both have same names and datatypes, ignores nullability + assert!(DFSchema::datatype_is_logically_equal( + &struct_field, + &DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int8, false), + Field::new("b", DataType::Int8, true), + ])) + )); + + // Fails if field names are different + assert!(!DFSchema::datatype_is_logically_equal( + &struct_field, + &DataType::Struct(Fields::from(vec![ + Field::new("x", DataType::Int8, true), + Field::new("y", DataType::Int8, true), + ])) + )); + + // Fails if types are different + assert!(!DFSchema::datatype_is_logically_equal( + &struct_field, + &DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int16, true), + Field::new("b", DataType::Int8, true), + ])) + )); + + // Fails if more or less fields + assert!(!DFSchema::datatype_is_logically_equal( + &struct_field, + &DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int8, true),])) + )); + } + + #[test] + fn test_datatype_is_not_semantically_equivalent_to_dictionary() { + // Dictionary is not semantically equal to its value type + assert!(!DFSchema::datatype_is_semantically_equal( + &DataType::Utf8, + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) + )); + } + fn test_schema_2() -> Schema { Schema::new(vec![ Field::new("c100", DataType::Boolean, true),