
Update to dat 0.3 #290

Merged · 5 commits · Jul 22, 2024
1 change: 1 addition & 0 deletions acceptance/Cargo.toml
@@ -11,6 +11,7 @@ version.workspace = true

[dependencies]
arrow-array = { workspace = true }
+arrow-cast = { workspace = true }
arrow-ord = { workspace = true }
arrow-select = { workspace = true }
arrow-schema = { workspace = true }
2 changes: 1 addition & 1 deletion acceptance/build.rs
@@ -10,7 +10,7 @@ use tar::Archive;

const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done";
const OUTPUT_FOLDER: &str = "tests/dat";
const VERSION: &str = "0.0.2";
const VERSION: &str = "0.0.3";

fn main() {
    if dat_exists() {
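For context (this note is not part of the diff): `VERSION` selects which dat release the acceptance crate's build script downloads and unpacks into `tests/dat`, so bumping it to 0.0.3 pulls the new reference tables on the next build, while the `.done` marker file short-circuits repeat downloads. A minimal sketch of that check-then-unpack pattern follows; the URL shape and the download step are assumptions, not the real build.rs:

use std::{fs::File, path::Path};

use flate2::read::GzDecoder;
use tar::Archive;

const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done";
const OUTPUT_FOLDER: &str = "tests/dat";
const VERSION: &str = "0.0.3";

fn main() {
    // A previous build already unpacked this dat release; nothing to do.
    if Path::new(DAT_EXISTS_FILE_CHECK).exists() {
        return;
    }
    // Hypothetical URL shape; the real build.rs derives the link from VERSION.
    let _url = format!("https://example.invalid/dat/v{VERSION}/deltalake-dat.tar.gz");
    // Assume `_url` was fetched to `dat.tar.gz` by an HTTP client of your choice.
    let tarball = File::open("dat.tar.gz").expect("tarball downloaded beforehand");
    // Decompress and unpack the reference tables where the tests expect them.
    Archive::new(GzDecoder::new(tarball))
        .unpack(OUTPUT_FOLDER)
        .expect("unpack dat tables");
    // Drop the marker so subsequent builds skip the download.
    File::create(DAT_EXISTS_FILE_CHECK).expect("write .done marker");
}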
50 changes: 32 additions & 18 deletions acceptance/src/data.rs
@@ -1,6 +1,6 @@
use std::{path::Path, sync::Arc};

-use arrow_array::RecordBatch;
+use arrow_array::{Array, RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::{DataType, Schema};
use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
@@ -60,17 +60,11 @@ pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult<RecordBatch> {
    Ok(RecordBatch::try_new(batch.schema(), columns)?)
}

-static SKIPPED_TESTS: &[&str; 2] = &[
-    // For all_primitive_types and multi_partitioned_2: The golden table stores the timestamp as an
-    // INT96 (which is nanosecond precision), while the spec says we should read partition columns
-    // as microseconds. This means the read and golden data don't line up. When this is released in
-    // `dat` upstream, we can stop skipping these tests
-    "all_primitive_types",
-    "multi_partitioned_2",
-];
+static SKIPPED_TESTS: &[&str; 0] = &[];

-// Ensure that two schema have the same field names, data types, and dict_id/ordering.
+// Ensure that two schemas have the same field names and dict_id/ordering.
// We ignore:
+// - data type: this is checked already in `assert_columns_match`
// - nullability: parquet marks many things as nullable that we don't in our schema
// - metadata: because it diverges between the real data and the golden table data
fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
@@ -79,10 +73,6 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
            schema_field.name() == golden_field.name(),
            "Field names don't match"
        );
-        assert!(
-            schema_field.data_type() == golden_field.data_type(),
-            "Field data types don't match"
-        );
        assert!(
            schema_field.dict_id() == golden_field.dict_id(),
            "Field dict_id doesn't match"
@@ -94,6 +84,33 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
    }
}

+// some things are equivalent, but don't show up as equivalent for `==`, so we normalize here
+fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
+    if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
+        if **zone == *"+00:00" {
+            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
+                .expect("Could not cast to UTC")
+        } else {
+            col
+        }
+    } else {
+        col
+    }
+}
+
+fn assert_columns_match(actual: &[Arc<dyn Array>], expected: &[Arc<dyn Array>]) {
+    for (actual, expected) in actual.iter().zip(expected) {
+        let actual = normalize_col(actual.clone());
+        let expected = normalize_col(expected.clone());
+        // note that array equality includes data_type equality
+        // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html
+        assert_eq!(
+            &actual, &expected,
+            "Column data didn't match. Got {actual:?}, expected {expected:?}"
+        );
+    }
+}

pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo) -> TestResult<()> {
    let root_dir = test_case.root_dir();
    for skipped in SKIPPED_TESTS {
@@ -135,10 +152,7 @@ pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo)
        .expect("Didn't find golden data");
    let golden = sort_record_batch(golden)?;

-    assert!(
-        all_data.columns() == golden.columns(),
-        "Read data does not equal golden data"
-    );
+    assert_columns_match(all_data.columns(), golden.columns());
    assert_schema_fields_match(all_data.schema().as_ref(), golden.schema().as_ref());
    assert!(
        all_data.num_rows() == golden.num_rows(),
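A side note on the new helpers above (not part of the diff): `normalize_col` exists because Arrow array equality includes data-type equality, and a timestamp column zoned "+00:00" has a different data type from one zoned "UTC" even when the instants are identical. A small standalone sketch of the mismatch and the cast that resolves it, using the same arrow crates the acceptance tests depend on:

use std::sync::Arc;

use arrow_array::{ArrayRef, TimestampMicrosecondArray};
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Identical instants; only the timezone spelling differs.
    let plus_zero: ArrayRef =
        Arc::new(TimestampMicrosecondArray::from(vec![1_000_000i64]).with_timezone("+00:00"));
    let utc: ArrayRef =
        Arc::new(TimestampMicrosecondArray::from(vec![1_000_000i64]).with_timezone("UTC"));

    // Array equality includes data-type equality, so these differ under `==`.
    assert_ne!(&plus_zero, &utc);

    // Rewriting the "+00:00" zone as "UTC" (what normalize_col does) restores equality.
    let normalized = arrow_cast::cast::cast(
        &plus_zero,
        &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
    )
    .expect("cast to UTC");
    assert_eq!(&normalized, &utc);
}

The cast only rewrites the timezone metadata; the underlying i64 microsecond values are untouched, which is why normalizing before the golden-table comparison is safe.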
@@ -7,25 +7,25 @@ Schema:
└─ a_float: double

number: [
6,
4,
5,
6,
1,
2,
3
]
a_float: [
6.6,
4.4,
5.5,
6.6,
1.1,
2.2,
3.3
]
letter: [
null,
"a",
"e",
"f",
"a",
"b",
"c"
Expand Down