diff --git a/docs/querying/groupbyquery.md b/docs/querying/groupbyquery.md index b20f24b02ce8..550238786b6c 100644 --- a/docs/querying/groupbyquery.md +++ b/docs/querying/groupbyquery.md @@ -436,6 +436,7 @@ Supported query contexts: |`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false| |`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false| |`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true| +|`groupByEnableMultiValueUnnesting`|Safety flag to enable/disable the implicit unnesting on multi value column's as part of the grouping key. 'true' indicates multi-value grouping keys are unnested. 'false' returns an error if a multi value column is found as part of the grouping key.|true| #### GroupBy v1 configurations diff --git a/docs/querying/multi-value-dimensions.md b/docs/querying/multi-value-dimensions.md index 952f21aaf3c6..36529a4541f8 100644 --- a/docs/querying/multi-value-dimensions.md +++ b/docs/querying/multi-value-dimensions.md @@ -375,3 +375,10 @@ This query returns the following result: Note that, for groupBy queries, you could get similar result with a [having spec](having.md) but using a filtered `dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline. Having specs are applied at the outermost level of groupBy query processing. + +## Disable GroupBy on multi-value columns + +You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your +query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety +feature for situations where you believe that all dimensions are singly-valued and want the engine to reject any +multi-valued dimensions that were inadvertently included. \ No newline at end of file diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java index 7bf2d0d3b445..5e32e88f2303 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java @@ -34,6 +34,7 @@ public class GroupByQueryConfig public static final String CTX_KEY_FORCE_PUSH_DOWN_NESTED_QUERY = "forcePushDownNestedQuery"; public static final String CTX_KEY_EXECUTING_NESTED_QUERY = "executingNestedQuery"; public static final String CTX_KEY_ARRAY_RESULT_ROWS = "resultAsArray"; + public static final String CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING = "groupByEnableMultiValueUnnesting"; private static final String CTX_KEY_IS_SINGLE_THREADED = "groupByIsSingleThreaded"; private static final String CTX_KEY_MAX_INTERMEDIATE_ROWS = "maxIntermediateRows"; private static final String CTX_KEY_MAX_RESULTS = "maxResults"; @@ -97,6 +98,9 @@ public class GroupByQueryConfig @JsonProperty private boolean vectorize = true; + @JsonProperty + private boolean enableMultiValueUnnesting = true; + public String getDefaultStrategy() { return defaultStrategy; @@ -192,6 +196,11 @@ public boolean isForcePushDownNestedQuery() return forcePushDownNestedQuery; } + public boolean isMultiValueUnnestingEnabled() + { + return enableMultiValueUnnesting; + } + public GroupByQueryConfig withOverrides(final GroupByQuery query) { final GroupByQueryConfig newConfig = new GroupByQueryConfig(); @@ -244,6 +253,10 @@ public GroupByQueryConfig withOverrides(final GroupByQuery query) getNumParallelCombineThreads() ); newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize()); + newConfig.enableMultiValueUnnesting = query.getContextBoolean( + CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + isMultiValueUnnestingEnabled() + ); return newConfig; } @@ -266,6 +279,7 @@ public String toString() ", numParallelCombineThreads=" + numParallelCombineThreads + ", vectorize=" + vectorize + ", forcePushDownNestedQuery=" + forcePushDownNestedQuery + + ", enableMultiValueUnnesting=" + enableMultiValueUnnesting + '}'; } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java index d3e072e2e4f1..62a82dc84413 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java @@ -32,6 +32,7 @@ import org.apache.druid.guice.annotations.Global; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.UOE; import org.apache.druid.java.util.common.guava.BaseSequence; import org.apache.druid.java.util.common.guava.FunctionalIterator; import org.apache.druid.java.util.common.guava.Sequence; @@ -91,7 +92,13 @@ public Sequence process(final GroupByQuery query, final StorageAdapter stor "Null storage adapter found. Probably trying to issue a query against a segment being memory unmapped." ); } - + if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) { + throw new UOE( + "GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2", + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING + ); + } final List intervals = query.getQuerySegmentSpec().getIntervals(); if (intervals.size() != 1) { throw new IAE("Should only have one interval, got[%s]", intervals); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index 5a5263efff1f..f38f6ce2c104 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -459,6 +459,8 @@ private abstract static class GroupByEngineIterator implements Iterator @Nullable protected CloseableGrouperIterator delegate = null; protected final boolean allSingleValueDims; + protected final boolean allowMultiValueGrouping; + public GroupByEngineIterator( final GroupByQuery query, @@ -480,6 +482,10 @@ public GroupByEngineIterator( // Time is the same for every row in the cursor this.timestamp = fudgeTimestamp != null ? fudgeTimestamp : cursor.getTime(); this.allSingleValueDims = allSingleValueDims; + this.allowMultiValueGrouping = query.getContextBoolean( + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + true + ); } private CloseableGrouperIterator initNewDelegate() @@ -593,6 +599,19 @@ protected int getSingleValue(IndexedInts indexedInts) return indexedInts.size() == 1 ? indexedInts.get(0) : GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE; } + protected void checkIfMultiValueGroupingIsAllowed(String dimName) + { + if (!allowMultiValueGrouping) { + throw new ISE( + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + dimName, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + ); + } + } + } private static class HashAggregateIterator extends GroupByEngineIterator @@ -764,6 +783,9 @@ protected void aggregateMultiValueDims(Grouper grouper) ); if (doAggregate) { + // this check is done during the row aggregation as a dimension can become multi-value col if column + // capabilities is unknown. + checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName()); stack[stackPointer]++; for (int i = stackPointer + 1; i < stack.length; i++) { dims[i].getColumnSelectorStrategy().initGroupingKeyColumnValue( @@ -889,12 +911,17 @@ private void aggregateMultiValueDims(IntGrouper grouper) } while (!cursor.isDone()) { - int multiValuesSize = multiValues.size(); + final int multiValuesSize = multiValues.size(); if (multiValuesSize == 0) { if (!grouper.aggregate(GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE).isOk()) { return; } } else { + if (multiValuesSize > 1) { + // this check is done during the row aggregation as a dimension can become multi-value col if column + // capabilities is unknown. + checkIfMultiValueGroupingIsAllowed(dim.getName()); + } for (; nextValIndex < multiValuesSize; nextValIndex++) { if (!grouper.aggregate(multiValues.get(nextValIndex)).isOk()) { return; diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index 5b89eb3a85b2..4b958d3c45e9 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -40,6 +40,7 @@ import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.UOE; import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.common.granularity.DurationGranularity; import org.apache.druid.java.util.common.granularity.Granularities; @@ -1310,6 +1311,41 @@ public void testMultiValueDimension() TestHelper.assertExpectedObjects(expectedResults, results, "multi-value-dim"); } + @Test + public void testMultiValueDimensionNotAllowed() + { + + if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) { + expectedException.expect(UOE.class); + expectedException.expectMessage(StringUtils.format( + "GroupBy v1 does not support %s as false", + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING + )); + } else if (!vectorize) { + expectedException.expect(RuntimeException.class); + expectedException.expectMessage(StringUtils.format( + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + "placementish", + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + )); + } else { + cannotVectorize(); + } + + GroupByQuery query = makeQueryBuilder() + .setDataSource(QueryRunnerTestHelper.DATA_SOURCE) + .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD) + .setDimensions(new DefaultDimensionSpec("placementish", "alias")) + .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index")) + .setGranularity(QueryRunnerTestHelper.ALL_GRAN) + .overrideContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false)) + .build(); + + GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + } + @Test public void testMultiValueDimensionAsArray() { diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java index e7203331ccc1..b28f418e9643 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.math.expr.ExpressionProcessing; import org.apache.druid.query.Druids; @@ -32,6 +33,7 @@ import org.apache.druid.query.filter.LikeDimFilter; import org.apache.druid.query.filter.SelectorDimFilter; import org.apache.druid.query.groupby.GroupByQuery; +import org.apache.druid.query.groupby.GroupByQueryConfig; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; import org.apache.druid.query.ordering.StringComparators; @@ -43,7 +45,9 @@ import org.apache.druid.sql.calcite.util.CalciteTests; import org.junit.Test; +import java.util.HashMap; import java.util.List; +import java.util.Map; public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest { @@ -53,7 +57,8 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception { // Cannot vectorize due to usage of expressions. cannotVectorize(); - + Map groupByOnMultiValueColumnEnabled = new HashMap<>(QUERY_CONTEXT_DEFAULT); + groupByOnMultiValueColumnEnabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true); List expected; if (NullHandling.replaceWithDefault()) { expected = ImmutableList.of( @@ -76,6 +81,7 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception } testQuery( "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC", + groupByOnMultiValueColumnEnabled, ImmutableList.of( GroupByQuery.builder() .setDataSource(CalciteTests.DATASOURCE3) @@ -103,6 +109,30 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception ); } + @Test + public void testMultiValueStringGroupByDoesNotWork() throws Exception + { + // Cannot vectorize due to usage of expressions. + cannotVectorize(); + Map groupByOnMultiValueColumnDisabled = new HashMap<>(QUERY_CONTEXT_DEFAULT); + groupByOnMultiValueColumnDisabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false); + testQueryThrows( + "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC", + groupByOnMultiValueColumnDisabled, + ImmutableList.of(), + exception -> { + exception.expect(RuntimeException.class); + expectedException.expectMessage(StringUtils.format( + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + "v0", + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + )); + } + ); + } + @Test public void testMultiValueStringWorksLikeStringGroupByWithFilter() throws Exception { diff --git a/website/.spelling b/website/.spelling index 8c44966bd327..f67272fa38af 100644 --- a/website/.spelling +++ b/website/.spelling @@ -1565,6 +1565,8 @@ outputName pushdown row1 subtotalsSpec +unnested +unnesting - ../docs/querying/having.md HavingSpec HavingSpecs @@ -1595,6 +1597,8 @@ row4 t3 t4 t5 +groupByEnableMultiValueUnnesting +unnesting - ../docs/querying/multitenancy.md 500ms tenant_id