-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement TPCH substrait integration test, support tpch_4 and tpch_5 #11311
Changes from 1 commit
13744f8
9ceb8c0
489b96c
f1ae84c
3b193f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -27,7 +27,7 @@ use datafusion::common::{ | |||||||||||||||||
substrait_err, DFSchema, DFSchemaRef, | ||||||||||||||||||
}; | ||||||||||||||||||
use datafusion::execution::FunctionRegistry; | ||||||||||||||||||
use datafusion::logical_expr::expr::{InSubquery, Sort}; | ||||||||||||||||||
use datafusion::logical_expr::expr::{Exists, InSubquery, Sort}; | ||||||||||||||||||
|
||||||||||||||||||
use datafusion::logical_expr::{ | ||||||||||||||||||
aggregate_function, expr::find_df_window_func, Aggregate, BinaryExpr, Case, | ||||||||||||||||||
|
@@ -1297,6 +1297,32 @@ pub async fn from_substrait_rex( | |||||||||||||||||
outer_ref_columns, | ||||||||||||||||||
}))) | ||||||||||||||||||
} | ||||||||||||||||||
SubqueryType::SetPredicate(predicate) => { | ||||||||||||||||||
match predicate.predicate_op { | ||||||||||||||||||
// exist | ||||||||||||||||||
1 => { | ||||||||||||||||||
let relations = &predicate.tuples; | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. super nit:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. in 489b96c |
||||||||||||||||||
let plan = from_substrait_rel( | ||||||||||||||||||
ctx, | ||||||||||||||||||
&relations.clone().unwrap_or_default(), | ||||||||||||||||||
extensions, | ||||||||||||||||||
) | ||||||||||||||||||
.await?; | ||||||||||||||||||
let outer_ref_columns = plan.all_out_ref_exprs(); | ||||||||||||||||||
Ok(Arc::new(Expr::Exists(Exists::new( | ||||||||||||||||||
Subquery { | ||||||||||||||||||
subquery: Arc::new(plan), | ||||||||||||||||||
outer_ref_columns, | ||||||||||||||||||
}, | ||||||||||||||||||
false, | ||||||||||||||||||
)))) | ||||||||||||||||||
} | ||||||||||||||||||
other_type => Err(DataFusionError::Substrait(format!( | ||||||||||||||||||
"unimplemented type {:?} for set predicate", | ||||||||||||||||||
other_type | ||||||||||||||||||
))), | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. would
Suggested change
work here as well? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. in f1ae84c |
||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
other_type => { | ||||||||||||||||||
substrait_err!("Subquery type {:?} not implemented", other_type) | ||||||||||||||||||
} | ||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,6 +90,40 @@ mod tests { | |
Ok(ctx) | ||
} | ||
|
||
/// Creates a new `SessionContext` and registers two CSV files with it via
/// `register_csv`: `orders.csv` as `FILENAME_PLACEHOLDER_0` and
/// `lineitem.csv` as `FILENAME_PLACEHOLDER_1`.
///
/// Returns the populated context; any error from `register_csv` is
/// propagated to the caller through `?`.
// NOTE(review): the placeholder table names presumably mirror the names used
// inside the query_4.json Substrait plan -- confirm against the plan file.
async fn create_context_tpch4() -> Result<SessionContext> { | ||
let ctx = SessionContext::new(); | ||
|
||
// (table name, CSV path) pairs, registered in the order listed.
let registrations = vec![ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't need to be part of this PR, but how about having a general create_context_tpch(registrations: Vec<(string, string)>)? and then writing the vec in the test functions |
||
("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/orders.csv"), | ||
("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/lineitem.csv"), | ||
]; | ||
|
||
for (table_name, file_path) in registrations { | ||
register_csv(&ctx, table_name, file_path).await?; | ||
} | ||
|
||
Ok(ctx) | ||
} | ||
|
||
/// Creates a new `SessionContext` and registers six CSV files with it via
/// `register_csv`: customer, orders, lineitem and supplier under
/// `FILENAME_PLACEHOLDER_0` .. `FILENAME_PLACEHOLDER_3`, plus nation and
/// region under the literal table names `NATION` and `REGION`.
///
/// Returns the populated context; any error from `register_csv` is
/// propagated to the caller through `?`.
// NOTE(review): the mix of placeholder and literal table names presumably
// mirrors the names used inside the query_5.json Substrait plan -- confirm
// against the plan file.
async fn create_context_tpch5() -> Result<SessionContext> { | ||
let ctx = SessionContext::new(); | ||
|
||
// (table name, CSV path) pairs, registered in the order listed.
let registrations = vec![ | ||
("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"), | ||
("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/orders.csv"), | ||
("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/lineitem.csv"), | ||
("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/supplier.csv"), | ||
("NATION", "tests/testdata/tpch/nation.csv"), | ||
("REGION", "tests/testdata/tpch/region.csv"), | ||
]; | ||
|
||
for (table_name, file_path) in registrations { | ||
register_csv(&ctx, table_name, file_path).await?; | ||
} | ||
|
||
Ok(ctx) | ||
} | ||
|
||
#[tokio::test] | ||
async fn tpch_test_1() -> Result<()> { | ||
let ctx = create_context_tpch1().await?; | ||
|
@@ -180,4 +214,56 @@ mod tests { | |
\n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\n TableScan: FILENAME_PLACEHOLDER_2 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]"); | ||
Ok(()) | ||
} | ||
|
||
/// Reads `tests/testdata/tpch_substrait_plans/query_4.json`, deserializes it
/// into a Substrait `Plan`, converts it with `from_substrait_plan` against the
/// context built by `create_context_tpch4`, and asserts that the `{:?}`
/// rendering of the resulting plan equals the expected text below.
///
/// Panics (via `expect`) if the JSON file cannot be opened or parsed; errors
/// from context creation or plan conversion are returned through `?`.
#[tokio::test] | ||
async fn tpch_test_4() -> Result<()> { | ||
let ctx = create_context_tpch4().await?; | ||
let path = "tests/testdata/tpch_substrait_plans/query_4.json"; | ||
let proto = serde_json::from_reader::<_, Plan>(BufReader::new( | ||
File::open(path).expect("file not found"), | ||
)) | ||
.expect("failed to parse json"); | ||
let plan = from_substrait_plan(&ctx, &proto).await?; | ||
let plan_str = format!("{:?}", plan); | ||
// NOTE(review): the expected subquery filter below compares
// FILENAME_PLACEHOLDER_1.l_orderkey with itself rather than with an orders
// column -- confirm this is the intended rendering of the correlated
// (outer-reference) predicate and not a lost outer reference.
assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.o_orderpriority AS O_ORDERPRIORITY, count(Int64(1)) AS ORDER_COUNT\ | ||
\n Sort: FILENAME_PLACEHOLDER_0.o_orderpriority ASC NULLS LAST\ | ||
\n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.o_orderpriority]], aggr=[[count(Int64(1))]]\ | ||
\n Projection: FILENAME_PLACEHOLDER_0.o_orderpriority\ | ||
\n Filter: FILENAME_PLACEHOLDER_0.o_orderdate >= CAST(Utf8(\"1993-07-01\") AS Date32) AND FILENAME_PLACEHOLDER_0.o_orderdate < CAST(Utf8(\"1993-10-01\") AS Date32) AND EXISTS (<subquery>)\ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
\n Subquery:\ | ||
\n Filter: FILENAME_PLACEHOLDER_1.l_orderkey = FILENAME_PLACEHOLDER_1.l_orderkey AND FILENAME_PLACEHOLDER_1.l_commitdate < FILENAME_PLACEHOLDER_1.l_receiptdate\ | ||
\n TableScan: FILENAME_PLACEHOLDER_1 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ | ||
\n TableScan: FILENAME_PLACEHOLDER_0 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]"); | ||
Ok(()) | ||
} | ||
|
||
/// Reads `tests/testdata/tpch_substrait_plans/query_5.json`, deserializes it
/// into a Substrait `Plan`, converts it with `from_substrait_plan` against the
/// context built by `create_context_tpch5`, and asserts that the `{:?}`
/// rendering of the resulting plan equals the expected text below.
///
/// Panics (via `expect`) if the JSON file cannot be opened or parsed; errors
/// from context creation or plan conversion are returned through `?`.
#[tokio::test] | ||
async fn tpch_test_5() -> Result<()> { | ||
let ctx = create_context_tpch5().await?; | ||
let path = "tests/testdata/tpch_substrait_plans/query_5.json"; | ||
let proto = serde_json::from_reader::<_, Plan>(BufReader::new( | ||
File::open(path).expect("file not found"), | ||
)) | ||
.expect("failed to parse json"); | ||
|
||
let plan = from_substrait_plan(&ctx, &proto).await?; | ||
let plan_str = format!("{:?}", plan); | ||
// The expected plan is a chain of five inner joins with `Boolean(true)`
// join filters; all join and selection predicates appear in the single
// Filter node above them.
assert_eq!(plan_str, "Projection: NATION.n_name AS N_NAME, sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount) AS REVENUE\ | ||
\n Sort: sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount) DESC NULLS FIRST\ | ||
\n Aggregate: groupBy=[[NATION.n_name]], aggr=[[sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount)]]\ | ||
\n Projection: NATION.n_name, FILENAME_PLACEHOLDER_2.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_2.l_discount)\ | ||
\n Filter: FILENAME_PLACEHOLDER_0.c_custkey = FILENAME_PLACEHOLDER_1.o_custkey AND FILENAME_PLACEHOLDER_2.l_orderkey = FILENAME_PLACEHOLDER_1.o_orderkey AND FILENAME_PLACEHOLDER_2.l_suppkey = FILENAME_PLACEHOLDER_3.s_suppkey AND FILENAME_PLACEHOLDER_0.c_nationkey = FILENAME_PLACEHOLDER_3.s_nationkey AND FILENAME_PLACEHOLDER_3.s_nationkey = NATION.n_nationkey AND NATION.n_regionkey = REGION.r_regionkey AND REGION.r_name = CAST(Utf8(\"ASIA\") AS Utf8) AND FILENAME_PLACEHOLDER_1.o_orderdate >= CAST(Utf8(\"1994-01-01\") AS Date32) AND FILENAME_PLACEHOLDER_1.o_orderdate < CAST(Utf8(\"1995-01-01\") AS Date32)\ | ||
\n Inner Join: Filter: Boolean(true)\ | ||
\n Inner Join: Filter: Boolean(true)\ | ||
\n Inner Join: Filter: Boolean(true)\ | ||
\n Inner Join: Filter: Boolean(true)\ | ||
\n Inner Join: Filter: Boolean(true)\ | ||
\n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\ | ||
\n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\ | ||
\n TableScan: FILENAME_PLACEHOLDER_2 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ | ||
\n TableScan: FILENAME_PLACEHOLDER_3 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ | ||
\n TableScan: NATION projection=[n_nationkey, n_name, n_regionkey, n_comment]\ | ||
\n TableScan: REGION projection=[r_regionkey, r_name, r_comment]"); | ||
Ok(()) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like we could use https://docs.rs/substrait/0.35.0/substrait/proto/expression/subquery/struct.SetPredicate.html#method.predicate_op to match on the `PredicateOp` enum rather than a constant. So like `match predicate.predicate_op() { PredicateOp::Exists => { ... } other => ... }`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I'll fix it.