
Update to dat 0.3 #290

Merged · 5 commits · Jul 22, 2024
1 change: 1 addition & 0 deletions acceptance/Cargo.toml
@@ -11,6 +11,7 @@ version.workspace = true

[dependencies]
arrow-array = { workspace = true }
+arrow-cast = { workspace = true }
arrow-ord = { workspace = true }
arrow-select = { workspace = true }
arrow-schema = { workspace = true }
2 changes: 1 addition & 1 deletion acceptance/build.rs
@@ -10,7 +10,7 @@ use tar::Archive;

const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done";
const OUTPUT_FOLDER: &str = "tests/dat";
const VERSION: &str = "0.0.2";
const VERSION: &str = "0.0.3";

fn main() {
    if dat_exists() {
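For context (this note is not part of the diff): `VERSION` selects which dat release the acceptance crate's build script downloads and unpacks into `tests/dat`, so bumping it to 0.0.3 pulls the new reference tables on the next build, while the `.done` marker file short-circuits repeat downloads. A minimal sketch of that check-then-unpack pattern follows; the URL shape and the download step are assumptions, not the real build.rs:

use std::{fs::File, path::Path};

use flate2::read::GzDecoder;
use tar::Archive;

const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done";
const OUTPUT_FOLDER: &str = "tests/dat";
const VERSION: &str = "0.0.3";

fn main() {
    // A previous build already unpacked this dat release; nothing to do.
    if Path::new(DAT_EXISTS_FILE_CHECK).exists() {
        return;
    }
    // Hypothetical URL shape; the real build.rs derives the link from VERSION.
    let _url = format!("https://example.invalid/dat/v{VERSION}/deltalake-dat.tar.gz");
    // Assume `_url` was fetched to `dat.tar.gz` by an HTTP client of your choice.
    let tarball = File::open("dat.tar.gz").expect("tarball downloaded beforehand");
    // Decompress and unpack the reference tables where the tests expect them.
    Archive::new(GzDecoder::new(tarball))
        .unpack(OUTPUT_FOLDER)
        .expect("unpack dat tables");
    // Drop the marker so subsequent builds skip the download.
    File::create(DAT_EXISTS_FILE_CHECK).expect("write .done marker");
}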
50 changes: 32 additions & 18 deletions acceptance/src/data.rs
@@ -1,6 +1,6 @@
use std::{path::Path, sync::Arc};

-use arrow_array::RecordBatch;
+use arrow_array::{Array, RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::{DataType, Schema};
use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
@@ -60,17 +60,11 @@ pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult<RecordBatch> {
    Ok(RecordBatch::try_new(batch.schema(), columns)?)
}

-static SKIPPED_TESTS: &[&str; 2] = &[
-    // For all_primitive_types and multi_partitioned_2: The golden table stores the timestamp as an
-    // INT96 (which is nanosecond precision), while the spec says we should read partition columns
-    // as microseconds. This means the read and golden data don't line up. When this is released in
-    // `dat` upstream, we can stop skipping these tests
-    "all_primitive_types",
-    "multi_partitioned_2",
-];
+static SKIPPED_TESTS: &[&str; 0] = &[];

-// Ensure that two schema have the same field names, data types, and dict_id/ordering.
+// Ensure that two schemas have the same field names and dict_id/ordering.
// We ignore:
+// - data type: this is checked already in `assert_columns_match`
// - nullability: parquet marks many things as nullable that we don't in our schema
// - metadata: because it diverges between the real data and the golden table data
fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
@@ -79,10 +73,6 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
            schema_field.name() == golden_field.name(),
            "Field names don't match"
        );
-        assert!(
-            schema_field.data_type() == golden_field.data_type(),
-            "Field data types don't match"
-        );
        assert!(
            schema_field.dict_id() == golden_field.dict_id(),
            "Field dict_id doesn't match"
@@ -94,6 +84,33 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
    }
}

+// some things are equivalent, but don't show up as equivalent for `==`, so we normalize here
+fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
+    if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
+        if **zone == *"+00:00" {
+            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
+                .expect("Could not cast to UTC")
+        } else {
+            col
+        }
+    } else {
+        col
+    }
+}
+
+fn assert_columns_match(actual: &[Arc<dyn Array>], expected: &[Arc<dyn Array>]) {
+    for (actual, expected) in actual.iter().zip(expected) {
+        let actual = normalize_col(actual.clone());
+        let expected = normalize_col(expected.clone());
+        // note that array equality includes data_type equality
+        // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html
+        assert_eq!(
+            &actual, &expected,
+            "Column data didn't match. Got {actual:?}, expected {expected:?}"
+        );
+    }
+}

pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo) -> TestResult<()> {
    let root_dir = test_case.root_dir();
    for skipped in SKIPPED_TESTS {
@@ -135,10 +152,7 @@ pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo)
        .expect("Didn't find golden data");
    let golden = sort_record_batch(golden)?;

-    assert!(
-        all_data.columns() == golden.columns(),
-        "Read data does not equal golden data"
-    );
+    assert_columns_match(all_data.columns(), golden.columns());
    assert_schema_fields_match(all_data.schema().as_ref(), golden.schema().as_ref());
    assert!(
        all_data.num_rows() == golden.num_rows(),
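A side note on the new helpers above (not part of the diff): `normalize_col` exists because Arrow array equality includes data-type equality, and a timestamp column zoned "+00:00" has a different data type from one zoned "UTC" even when the instants are identical. A small standalone sketch of the mismatch and the cast that resolves it, using the same arrow crates the acceptance tests depend on:

use std::sync::Arc;

use arrow_array::{ArrayRef, TimestampMicrosecondArray};
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Identical instants; only the timezone spelling differs.
    let plus_zero: ArrayRef =
        Arc::new(TimestampMicrosecondArray::from(vec![1_000_000i64]).with_timezone("+00:00"));
    let utc: ArrayRef =
        Arc::new(TimestampMicrosecondArray::from(vec![1_000_000i64]).with_timezone("UTC"));

    // Array equality includes data-type equality, so these differ under `==`.
    assert_ne!(&plus_zero, &utc);

    // Rewriting the "+00:00" zone as "UTC" (what normalize_col does) restores equality.
    let normalized = arrow_cast::cast::cast(
        &plus_zero,
        &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
    )
    .expect("cast to UTC");
    assert_eq!(&normalized, &utc);
}

The cast only rewrites the timezone metadata; the underlying i64 microsecond values are untouched, which is why normalizing before the golden-table comparison is safe.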
@@ -7,25 +7,25 @@ Schema:
└─ a_float: double

number: [
6,
4,
5,
6,
1,
2,
3
]
a_float: [
6.6,
4.4,
5.5,
6.6,
1.1,
2.2,
3.3
]
letter: [
null,
"a",
"e",
"f",
"a",
"b",
"c"
Expand Down