Skip to content

Commit

Permalink
SQL: Allow sorting of groups by aggregates (#38042)
Browse files Browse the repository at this point in the history
Introduce client-side sorting of groups based on aggregate
functions. To allow this, the Analyzer has been extended to push down
to underlying Aggregate, aggregate function and the Querier has been
extended to identify the case and consume the results in order and sort
them based on the given columns.
The underlying QueryContainer has been slightly modified to allow a view
of the underlying values being extracted as the columns used for sorting
might not be requested by the user.

The PR also adds minor tweaks, mainly related to tree output.

Close #35118
  • Loading branch information
costin authored Feb 1, 2019
1 parent 630889b commit 783c9ed
Show file tree
Hide file tree
Showing 60 changed files with 1,349 additions and 407 deletions.
14 changes: 12 additions & 2 deletions docs/reference/sql/limitations.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,18 @@ a field is an array (has multiple values) or not, so without reading all the dat
=== Sorting by aggregation

When doing aggregations (`GROUP BY`) {es-sql} relies on {es}'s `composite` aggregation for its support for paginating results.
But this type of aggregation does come with a limitation: sorting can only be applied on the key used for the aggregation's buckets. This
means that queries like `SELECT * FROM test GROUP BY age ORDER BY COUNT(*)` are not possible.
However this type of aggregation does come with a limitation: sorting can only be applied on the key used for the aggregation's buckets.
{es-sql} overcomes this limitation by doing client-side sorting however as a safety measure, allows only up to *512* rows.

It is recommended to use `LIMIT` for queries that use sorting by aggregation, essentially indicating the top N results that are desired:

[source, sql]
--------------------------------------------------
SELECT * FROM test GROUP BY age ORDER BY COUNT(*) LIMIT 100;
--------------------------------------------------

It is possible to run the same queries without a `LIMIT` however in that case if the maximum size (*512*) is passed, an exception will be
returned as {es-sql} is unable to track (and sort) all the results returned.

[float]
=== Using aggregation functions on top of scalar functions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public void testExplainBasic() throws IOException {
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("With[{}]"));
assertThat(readLine(), startsWith("\\_Project[[?*]]"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[test]"));
assertEquals("", readLine());

assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT * FROM test"), containsString("plan"));
Expand Down Expand Up @@ -64,22 +64,22 @@ public void testExplainWithWhere() throws IOException {
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("With[{}]"));
assertThat(readLine(), startsWith("\\_Project[[?*]]"));
assertThat(readLine(), startsWith(" \\_Filter[i = 2#"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
assertThat(readLine(), startsWith(" \\_Filter[Equals[?i,2"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[test]"));
assertEquals("", readLine());

assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT * FROM test WHERE i = 2"),
containsString("plan"));
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("Project[[i{f}#"));
assertThat(readLine(), startsWith("\\_Filter[i = 2#"));
assertThat(readLine(), startsWith("\\_Filter[Equals[i"));
assertThat(readLine(), startsWith(" \\_EsRelation[test][i{f}#"));
assertEquals("", readLine());

assertThat(command("EXPLAIN (PLAN OPTIMIZED) SELECT * FROM test WHERE i = 2"), containsString("plan"));
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("Project[[i{f}#"));
assertThat(readLine(), startsWith("\\_Filter[i = 2#"));
assertThat(readLine(), startsWith("\\_Filter[Equals[i"));
assertThat(readLine(), startsWith(" \\_EsRelation[test][i{f}#"));
assertEquals("", readLine());

Expand Down Expand Up @@ -123,20 +123,20 @@ public void testExplainWithCount() throws IOException {
assertThat(command("EXPLAIN (PLAN PARSED) SELECT COUNT(*) FROM test"), containsString("plan"));
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("With[{}]"));
assertThat(readLine(), startsWith("\\_Project[[?COUNT(*)]]"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
assertThat(readLine(), startsWith("\\_Project[[?COUNT[?*]]]"));
assertThat(readLine(), startsWith(" \\_UnresolvedRelation[test]"));
assertEquals("", readLine());

assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT COUNT(*) FROM test"),
containsString("plan"));
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("Aggregate[[],[COUNT(*)#"));
assertThat(readLine(), startsWith("Aggregate[[],[Count[*=1"));
assertThat(readLine(), startsWith("\\_EsRelation[test][i{f}#"));
assertEquals("", readLine());

assertThat(command("EXPLAIN (PLAN OPTIMIZED) SELECT COUNT(*) FROM test"), containsString("plan"));
assertThat(readLine(), startsWith("----------"));
assertThat(readLine(), startsWith("Aggregate[[],[COUNT(*)#"));
assertThat(readLine(), startsWith("Aggregate[[],[Count[*=1"));
assertThat(readLine(), startsWith("\\_EsRelation[test][i{f}#"));
assertEquals("", readLine());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void testSelectProjectScoreInAggContext() throws Exception {
public void testSelectOrderByScoreInAggContext() throws Exception {
index("test", body -> body.field("foo", 1));
assertFoundOneProblem(command("SELECT foo, COUNT(*) FROM test GROUP BY foo ORDER BY SCORE()"));
assertEquals("line 1:54: Cannot order by non-grouped column [SCORE()], expected [foo]" + END, readLine());
assertEquals("line 1:54: Cannot order by non-grouped column [SCORE()], expected [foo] or an aggregate function" + END, readLine());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ public void testSelectOrderByScoreInAggContext() throws Exception {
try (Connection c = esJdbc()) {
SQLException e = expectThrows(SQLException.class, () ->
c.prepareStatement("SELECT foo, COUNT(*) FROM test GROUP BY foo ORDER BY SCORE()").executeQuery());
assertEquals("Found 1 problem(s)\nline 1:54: Cannot order by non-grouped column [SCORE()], expected [foo]", e.getMessage());
assertEquals(
"Found 1 problem(s)\nline 1:54: Cannot order by non-grouped column [SCORE()], expected [foo] or an aggregate function",
e.getMessage());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public static List<Object[]> readScriptSpec() throws Exception {
tests.addAll(readScriptSpec("/datetime.sql-spec", parser));
tests.addAll(readScriptSpec("/math.sql-spec", parser));
tests.addAll(readScriptSpec("/agg.sql-spec", parser));
tests.addAll(readScriptSpec("/agg-ordering.sql-spec", parser));
tests.addAll(readScriptSpec("/arithmetic.sql-spec", parser));
tests.addAll(readScriptSpec("/string-functions.sql-spec", parser));
tests.addAll(readScriptSpec("/case-functions.sql-spec", parser));
Expand Down
87 changes: 87 additions & 0 deletions x-pack/plugin/sql/qa/src/main/resources/agg-ordering.sql-spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//
// Custom sorting/ordering on aggregates
//

countWithImplicitGroupBy
SELECT MAX(salary) AS m FROM test_emp ORDER BY COUNT(*);

countWithImplicitGroupByWithHaving
SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*);

countAndMaxWithImplicitGroupBy
SELECT MAX(salary) AS m FROM test_emp ORDER BY MAX(salary), COUNT(*);

maxWithAliasWithImplicitGroupBy
SELECT MAX(salary) AS m FROM test_emp ORDER BY m;

maxWithAliasWithImplicitGroupByAndHaving
SELECT MAX(salary) AS m FROM test_emp HAVING COUNT(*) > 1 ORDER BY m;

multipleOrderWithImplicitGroupByWithHaving
SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*), m DESC;

multipleOrderWithImplicitGroupByWithoutAlias
SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*), MIN(salary) DESC;

multipleOrderWithImplicitGroupByOfOrdinals
SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp HAVING MIN(salary) > 1 ORDER BY 1, COUNT(*), 2 DESC;

aggWithoutAlias
SELECT MAX(salary) AS max FROM test_emp GROUP BY gender ORDER BY MAX(salary);

aggWithAlias
SELECT MAX(salary) AS m FROM test_emp GROUP BY gender ORDER BY m;

multipleAggsThatGetRewrittenWithoutAlias
SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY MAX(salary);

multipleAggsThatGetRewrittenWithAliasDesc
SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY 1 DESC;

multipleAggsThatGetRewrittenWithAlias
SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY max;

aggNotSpecifiedInTheAggregate
SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender ORDER BY MAX(salary);

aggNotSpecifiedInTheAggregatePlusOrdinal
SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender ORDER BY MAX(salary), 2 DESC;

aggNotSpecifiedInTheAggregateWithHaving
SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary);

aggNotSpecifiedInTheAggregateWithHavingDesc
SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary) DESC;

aggNotSpecifiedInTheAggregateAndGroupWithHaving
SELECT gender, MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary), gender;

groupAndAggNotSpecifiedInTheAggregateWithHaving
SELECT gender, MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY gender, MAX(salary);

multipleAggsThatGetRewrittenWithAliasOnAMediumGroupBy
SELECT languages, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY languages ORDER BY max;

multipleAggsThatGetRewrittenWithAliasOnALargeGroupBy
SELECT emp_no, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY emp_no ORDER BY max;

multipleAggsThatGetRewrittenWithAliasOnAMediumGroupByWithHaving
SELECT languages, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY languages HAVING min BETWEEN 1000 AND 99999 ORDER BY max;

aggNotSpecifiedInTheAggregatemultipleAggsThatGetRewrittenWithAliasOnALargeGroupBy
SELECT emp_no, MIN(salary) AS min FROM test_emp GROUP BY emp_no ORDER BY MAX(salary);

aggNotSpecifiedWithHavingOnLargeGroupBy
SELECT MAX(salary) AS max FROM test_emp GROUP BY emp_no HAVING AVG(salary) > 1000 ORDER BY MIN(salary);

aggWithTieBreakerDescAsc
SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MIN(languages) DESC NULLS FIRST, emp_no ASC;

aggWithTieBreakerDescDesc
SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MIN(languages) DESC NULLS FIRST, emp_no DESC;

aggWithTieBreakerAscDesc
SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MAX(languages) ASC NULLS FIRST, emp_no DESC;

aggWithMixOfOrdinals
SELECT gender AS g, MAX(salary) AS m FROM test_emp GROUP BY gender ORDER BY 2 DESC LIMIT 3;
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
import org.elasticsearch.xpack.sql.type.DataTypes;
import org.elasticsearch.xpack.sql.type.InvalidMappedField;
import org.elasticsearch.xpack.sql.type.UnsupportedEsField;
import org.elasticsearch.xpack.sql.util.CollectionUtils;
import org.elasticsearch.xpack.sql.util.Holder;

import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -106,7 +108,8 @@ protected Iterable<RuleExecutor<LogicalPlan>.Batch> batches() {
new ResolveFunctions(),
new ResolveAliases(),
new ProjectedAggregations(),
new ResolveAggsInHaving()
new ResolveAggsInHaving(),
new ResolveAggsInOrderBy()
//new ImplicitCasting()
);
Batch finish = new Batch("Finish Analysis",
Expand Down Expand Up @@ -926,62 +929,57 @@ protected LogicalPlan rule(Project p) {
// Handle aggs in HAVING. To help folding any aggs not found in Aggregation
// will be pushed down to the Aggregate and then projected. This also simplifies the Verifier's job.
//
private class ResolveAggsInHaving extends AnalyzeRule<LogicalPlan> {
private class ResolveAggsInHaving extends AnalyzeRule<Filter> {

@Override
protected boolean skipResolved() {
return false;
}

@Override
protected LogicalPlan rule(LogicalPlan plan) {
protected LogicalPlan rule(Filter f) {
// HAVING = Filter followed by an Agg
if (plan instanceof Filter) {
Filter f = (Filter) plan;
if (f.child() instanceof Aggregate && f.child().resolved()) {
Aggregate agg = (Aggregate) f.child();
if (f.child() instanceof Aggregate && f.child().resolved()) {
Aggregate agg = (Aggregate) f.child();

Set<NamedExpression> missing = null;
Expression condition = f.condition();
Set<NamedExpression> missing = null;
Expression condition = f.condition();

// the condition might contain an agg (AVG(salary)) that could have been resolved
// (salary cannot be pushed down to Aggregate since there's no grouping and thus the function wasn't resolved either)
// the condition might contain an agg (AVG(salary)) that could have been resolved
// (salary cannot be pushed down to Aggregate since there's no grouping and thus the function wasn't resolved either)

// so try resolving the condition in one go through a 'dummy' aggregate
if (!condition.resolved()) {
// that's why try to resolve the condition
Aggregate tryResolvingCondition = new Aggregate(agg.source(), agg.child(), agg.groupings(),
combine(agg.aggregates(), new Alias(f.source(), ".having", condition)));
// so try resolving the condition in one go through a 'dummy' aggregate
if (!condition.resolved()) {
// that's why try to resolve the condition
Aggregate tryResolvingCondition = new Aggregate(agg.source(), agg.child(), agg.groupings(),
combine(agg.aggregates(), new Alias(f.source(), ".having", condition)));

tryResolvingCondition = (Aggregate) analyze(tryResolvingCondition, false);
tryResolvingCondition = (Aggregate) analyze(tryResolvingCondition, false);

// if it got resolved
if (tryResolvingCondition.resolved()) {
// replace the condition with the resolved one
condition = ((Alias) tryResolvingCondition.aggregates()
.get(tryResolvingCondition.aggregates().size() - 1)).child();
} else {
// else bail out
return plan;
}
// if it got resolved
if (tryResolvingCondition.resolved()) {
// replace the condition with the resolved one
condition = ((Alias) tryResolvingCondition.aggregates()
.get(tryResolvingCondition.aggregates().size() - 1)).child();
} else {
// else bail out
return f;
}
}

missing = findMissingAggregate(agg, condition);

if (!missing.isEmpty()) {
Aggregate newAgg = new Aggregate(agg.source(), agg.child(), agg.groupings(),
combine(agg.aggregates(), missing));
Filter newFilter = new Filter(f.source(), newAgg, condition);
// preserve old output
return new Project(f.source(), newFilter, f.output());
}
missing = findMissingAggregate(agg, condition);

return new Filter(f.source(), f.child(), condition);
if (!missing.isEmpty()) {
Aggregate newAgg = new Aggregate(agg.source(), agg.child(), agg.groupings(),
combine(agg.aggregates(), missing));
Filter newFilter = new Filter(f.source(), newAgg, condition);
// preserve old output
return new Project(f.source(), newFilter, f.output());
}
return plan;
}

return plan;
return new Filter(f.source(), f.child(), condition);
}
return f;
}

private Set<NamedExpression> findMissingAggregate(Aggregate target, Expression from) {
Expand All @@ -1001,6 +999,66 @@ private Set<NamedExpression> findMissingAggregate(Aggregate target, Expression f
}
}


//
// Handle aggs in ORDER BY. To help folding any aggs not found in Aggregation
// will be pushed down to the Aggregate and then projected. This also simplifies the Verifier's job.
// Similar to Having however using a different matching pattern since HAVING is always Filter with Agg,
// while an OrderBy can have multiple intermediate nodes (Filter,Project, etc...)
//
private static class ResolveAggsInOrderBy extends AnalyzeRule<OrderBy> {

@Override
protected boolean skipResolved() {
return false;
}

@Override
protected LogicalPlan rule(OrderBy ob) {
List<Order> orders = ob.order();

// 1. collect aggs inside an order by
List<NamedExpression> aggs = new ArrayList<>();
for (Order order : orders) {
if (Functions.isAggregate(order.child())) {
aggs.add(Expressions.wrapAsNamed(order.child()));
}
}
if (aggs.isEmpty()) {
return ob;
}

// 2. find first Aggregate child and update it
final Holder<Boolean> found = new Holder<>(Boolean.FALSE);

LogicalPlan plan = ob.transformDown(a -> {
if (found.get() == Boolean.FALSE) {
found.set(Boolean.TRUE);

List<NamedExpression> missing = new ArrayList<>();

for (NamedExpression orderedAgg : aggs) {
if (Expressions.anyMatch(a.aggregates(), e -> Expressions.equalsAsAttribute(e, orderedAgg)) == false) {
missing.add(orderedAgg);
}
}
// agg already contains all aggs
if (missing.isEmpty() == false) {
// save aggregates
return new Aggregate(a.source(), a.child(), a.groupings(), CollectionUtils.combine(a.aggregates(), missing));
}
}
return a;
}, Aggregate.class);

// if the plan was updated, project the initial aggregates
if (plan != ob) {
return new Project(ob.source(), plan, ob.output());
}
return ob;
}
}

private class PruneDuplicateFunctions extends AnalyzeRule<LogicalPlan> {

@Override
Expand Down
Loading

0 comments on commit 783c9ed

Please sign in to comment.