-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Combine multiple IN
lists in ExprSimplifier
#8949
Changes from all commits
b411c90
c126954
cb19bb8
f4c3741
fd2fcba
69039ec
26413b4
af9e186
fe64b11
f106ed0
2c08079
3fb45db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! This module implements a rule that simplifies the values for `InList`s | ||
|
||
use std::collections::HashSet; | ||
|
||
use datafusion_common::tree_node::TreeNodeRewriter; | ||
use datafusion_common::Result; | ||
use datafusion_expr::expr::InList; | ||
use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; | ||
|
||
/// Simplify expressions that is guaranteed to be true or false to a literal boolean expression | ||
/// | ||
/// Rules: | ||
/// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists | ||
/// Intersection: | ||
/// 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false` | ||
/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)` | ||
/// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)` | ||
/// Union: | ||
/// 4. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` | ||
/// # This rule is handled by `or_in_list_simplifier.rs` | ||
/// 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` | ||
/// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression | ||
/// 6. `a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false` | ||
/// 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5` | ||
/// 8. `a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)` | ||
pub(super) struct InListSimplifier {} | ||
|
||
impl InListSimplifier { | ||
pub(super) fn new() -> Self { | ||
Self {} | ||
} | ||
} | ||
|
||
impl TreeNodeRewriter for InListSimplifier { | ||
type N = Expr; | ||
|
||
fn mutate(&mut self, expr: Expr) -> Result<Expr> { | ||
if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { | ||
if let (Expr::InList(l1), Operator::And, Expr::InList(l2)) = | ||
(left.as_ref(), op, right.as_ref()) | ||
{ | ||
if l1.expr == l2.expr && !l1.negated && !l2.negated { | ||
return inlist_intersection(l1, l2, false); | ||
} else if l1.expr == l2.expr && l1.negated && l2.negated { | ||
return inlist_union(l1, l2, true); | ||
} else if l1.expr == l2.expr && !l1.negated && l2.negated { | ||
return inlist_except(l1, l2); | ||
} else if l1.expr == l2.expr && l1.negated && !l2.negated { | ||
return inlist_except(l2, l1); | ||
} | ||
} else if let (Expr::InList(l1), Operator::Or, Expr::InList(l2)) = | ||
(left.as_ref(), op, right.as_ref()) | ||
{ | ||
if l1.expr == l2.expr && l1.negated && l2.negated { | ||
return inlist_intersection(l1, l2, true); | ||
} | ||
} | ||
} | ||
|
||
Ok(expr) | ||
} | ||
} | ||
|
||
fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result<Expr> { | ||
let mut seen: HashSet<Expr> = HashSet::new(); | ||
let list = l1 | ||
.list | ||
.iter() | ||
.chain(l2.list.iter()) | ||
.filter(|&e| seen.insert(e.to_owned())) | ||
.cloned() | ||
.collect::<Vec<_>>(); | ||
let merged_inlist = InList { | ||
expr: l1.expr.clone(), | ||
list, | ||
negated, | ||
}; | ||
Ok(Expr::InList(merged_inlist)) | ||
} | ||
|
||
fn inlist_intersection(l1: &InList, l2: &InList, negated: bool) -> Result<Expr> { | ||
let l1_set: HashSet<Expr> = l1.list.iter().cloned().collect(); | ||
let intersect_list: Vec<Expr> = l2 | ||
.list | ||
.iter() | ||
.filter(|x| l1_set.contains(x)) | ||
.cloned() | ||
.collect(); | ||
// e in () is always false | ||
// e not in () is always true | ||
if intersect_list.is_empty() { | ||
return Ok(lit(negated)); | ||
} | ||
let merged_inlist = InList { | ||
expr: l1.expr.clone(), | ||
list: intersect_list, | ||
negated, | ||
}; | ||
Ok(Expr::InList(merged_inlist)) | ||
} | ||
|
||
fn inlist_except(l1: &InList, l2: &InList) -> Result<Expr> { | ||
let l2_set: HashSet<Expr> = l2.list.iter().cloned().collect(); | ||
let except_list: Vec<Expr> = l1 | ||
.list | ||
.iter() | ||
.filter(|x| !l2_set.contains(x)) | ||
.cloned() | ||
.collect(); | ||
if except_list.is_empty() { | ||
return Ok(lit(false)); | ||
} | ||
let merged_inlist = InList { | ||
expr: l1.expr.clone(), | ||
list: except_list, | ||
negated: false, | ||
}; | ||
Ok(Expr::InList(merged_inlist)) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -725,3 +725,40 @@ AggregateExec: mode=SinglePartitioned, gby=[p_partkey@2 as p_partkey], aggr=[SUM | |
--------CoalesceBatchesExec: target_batch_size=8192 | ||
----------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=1 | ||
------------MemoryExec: partitions=1, partition_sizes=[1] | ||
|
||
# Inlist simplification | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend one or two tests in slt, and then adding the other tests in https://github.com/apache/arrow-datafusion/blob/795c71f3e8249847593a58dafe578ffcbcd3e012/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs#L1302-L1307 because:
|
||
|
||
statement ok | ||
create table t(x int) as values (1), (2), (3); | ||
|
||
query TT | ||
explain select x from t where x IN (1,2,3) AND x IN (4,5); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i have some additional suggested tests:
Maybe also some tests that could in theory be simplified like
(we don't have to actually implement that simplification in this PR, but it would be good to document the potential further improvement in tests) |
||
---- | ||
logical_plan EmptyRelation | ||
physical_plan EmptyExec | ||
|
||
query TT | ||
explain select x from t where x NOT IN (1,2,3,4) OR x NOT IN (5,6,7,8); | ||
---- | ||
logical_plan TableScan: t projection=[x] | ||
physical_plan MemoryExec: partitions=1, partition_sizes=[1] | ||
|
||
query TT | ||
explain select x from t where x IN (1,2,3,4,5) AND x NOT IN (1,2,3,4); | ||
---- | ||
logical_plan | ||
Filter: t.x = Int32(5) | ||
--TableScan: t projection=[x] | ||
physical_plan | ||
CoalesceBatchesExec: target_batch_size=8192 | ||
--FilterExec: x@0 = 5 | ||
----MemoryExec: partitions=1, partition_sizes=[1] | ||
|
||
query TT | ||
explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3); | ||
---- | ||
logical_plan EmptyRelation | ||
physical_plan EmptyExec | ||
|
||
statement ok | ||
drop table t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you could avoid a clone here potentially because the mutate function gets passed an
owed
expr, like:(we could do this as a follow on PR too)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we still need
expr
orBinaray Expr
if failed to matchif let
statement. I did not find out how to get the cloned value if we consume the value at first hand.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I spent some time fooling with it this afternoon and here is what I came up with #8971