Skip to content

Commit

Permalink
Adding new config for disabling group by on multiValue column (#12253)
Browse files Browse the repository at this point in the history
As part of #12078 one of the followup's was to have a specific config which does not allow accidental unnesting of multi value columns if such columns become part of the grouping key.
Added a config groupByEnableMultiValueUnnesting which can be set in the query context.

The default value of groupByEnableMultiValueUnnesting is true, therefore it does not change the current engine behavior.
If groupByEnableMultiValueUnnesting is set to false, the query will fail if it encounters a multi-value column in the grouping key.
  • Loading branch information
cryptoe authored Feb 16, 2022
1 parent 8fc0e5c commit 5794331
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/querying/groupbyquery.md
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ Supported query contexts:
|`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false|
|`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false|
|`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true|
|`groupByEnableMultiValueUnnesting`|Safety flag to enable/disable the implicit unnesting on multi value column's as part of the grouping key. 'true' indicates multi-value grouping keys are unnested. 'false' returns an error if a multi value column is found as part of the grouping key.|true|


#### GroupBy v1 configurations
Expand Down
7 changes: 7 additions & 0 deletions docs/querying/multi-value-dimensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -375,3 +375,10 @@ This query returns the following result:
Note that, for groupBy queries, you could get similar result with a [having spec](having.md) but using a filtered
`dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline.
Having specs are applied at the outermost level of groupBy query processing.

## Disable GroupBy on multi-value columns

You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your
query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety
feature for situations where you believe that all dimensions are singly-valued and want the engine to reject any
multi-valued dimensions that were inadvertently included.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public class GroupByQueryConfig
public static final String CTX_KEY_FORCE_PUSH_DOWN_NESTED_QUERY = "forcePushDownNestedQuery";
public static final String CTX_KEY_EXECUTING_NESTED_QUERY = "executingNestedQuery";
public static final String CTX_KEY_ARRAY_RESULT_ROWS = "resultAsArray";
public static final String CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING = "groupByEnableMultiValueUnnesting";
private static final String CTX_KEY_IS_SINGLE_THREADED = "groupByIsSingleThreaded";
private static final String CTX_KEY_MAX_INTERMEDIATE_ROWS = "maxIntermediateRows";
private static final String CTX_KEY_MAX_RESULTS = "maxResults";
Expand Down Expand Up @@ -97,6 +98,9 @@ public class GroupByQueryConfig
@JsonProperty
private boolean vectorize = true;

@JsonProperty
private boolean enableMultiValueUnnesting = true;

public String getDefaultStrategy()
{
return defaultStrategy;
Expand Down Expand Up @@ -192,6 +196,11 @@ public boolean isForcePushDownNestedQuery()
return forcePushDownNestedQuery;
}

public boolean isMultiValueUnnestingEnabled()
{
return enableMultiValueUnnesting;
}

public GroupByQueryConfig withOverrides(final GroupByQuery query)
{
final GroupByQueryConfig newConfig = new GroupByQueryConfig();
Expand Down Expand Up @@ -244,6 +253,10 @@ public GroupByQueryConfig withOverrides(final GroupByQuery query)
getNumParallelCombineThreads()
);
newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize());
newConfig.enableMultiValueUnnesting = query.getContextBoolean(
CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
isMultiValueUnnestingEnabled()
);
return newConfig;
}

Expand All @@ -266,6 +279,7 @@ public String toString()
", numParallelCombineThreads=" + numParallelCombineThreads +
", vectorize=" + vectorize +
", forcePushDownNestedQuery=" + forcePushDownNestedQuery +
", enableMultiValueUnnesting=" + enableMultiValueUnnesting +
'}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.druid.guice.annotations.Global;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.guava.BaseSequence;
import org.apache.druid.java.util.common.guava.FunctionalIterator;
import org.apache.druid.java.util.common.guava.Sequence;
Expand Down Expand Up @@ -91,7 +92,13 @@ public Sequence<Row> process(final GroupByQuery query, final StorageAdapter stor
"Null storage adapter found. Probably trying to issue a query against a segment being memory unmapped."
);
}

if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) {
throw new UOE(
"GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2",
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
);
}
final List<Interval> intervals = query.getQuerySegmentSpec().getIntervals();
if (intervals.size() != 1) {
throw new IAE("Should only have one interval, got[%s]", intervals);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,8 @@ private abstract static class GroupByEngineIterator<KeyType> implements Iterator
@Nullable
protected CloseableGrouperIterator<KeyType, ResultRow> delegate = null;
protected final boolean allSingleValueDims;
protected final boolean allowMultiValueGrouping;


public GroupByEngineIterator(
final GroupByQuery query,
Expand All @@ -480,6 +482,10 @@ public GroupByEngineIterator(
// Time is the same for every row in the cursor
this.timestamp = fudgeTimestamp != null ? fudgeTimestamp : cursor.getTime();
this.allSingleValueDims = allSingleValueDims;
this.allowMultiValueGrouping = query.getContextBoolean(
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
true
);
}

private CloseableGrouperIterator<KeyType, ResultRow> initNewDelegate()
Expand Down Expand Up @@ -593,6 +599,19 @@ protected int getSingleValue(IndexedInts indexedInts)
return indexedInts.size() == 1 ? indexedInts.get(0) : GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE;
}

protected void checkIfMultiValueGroupingIsAllowed(String dimName)
{
if (!allowMultiValueGrouping) {
throw new ISE(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
dimName,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
);
}
}

}

private static class HashAggregateIterator extends GroupByEngineIterator<ByteBuffer>
Expand Down Expand Up @@ -764,6 +783,9 @@ protected void aggregateMultiValueDims(Grouper<ByteBuffer> grouper)
);

if (doAggregate) {
// this check is done during the row aggregation as a dimension can become multi-value col if column
// capabilities is unknown.
checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName());
stack[stackPointer]++;
for (int i = stackPointer + 1; i < stack.length; i++) {
dims[i].getColumnSelectorStrategy().initGroupingKeyColumnValue(
Expand Down Expand Up @@ -889,12 +911,17 @@ private void aggregateMultiValueDims(IntGrouper grouper)
}

while (!cursor.isDone()) {
int multiValuesSize = multiValues.size();
final int multiValuesSize = multiValues.size();
if (multiValuesSize == 0) {
if (!grouper.aggregate(GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE).isOk()) {
return;
}
} else {
if (multiValuesSize > 1) {
// this check is done during the row aggregation as a dimension can become multi-value col if column
// capabilities is unknown.
checkIfMultiValueGroupingIsAllowed(dim.getName());
}
for (; nextValIndex < multiValuesSize; nextValIndex++) {
if (!grouper.aggregate(multiValues.get(nextValIndex)).isOk()) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.granularity.DurationGranularity;
import org.apache.druid.java.util.common.granularity.Granularities;
Expand Down Expand Up @@ -1310,6 +1311,41 @@ public void testMultiValueDimension()
TestHelper.assertExpectedObjects(expectedResults, results, "multi-value-dim");
}

@Test
public void testMultiValueDimensionNotAllowed()
{

if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) {
expectedException.expect(UOE.class);
expectedException.expectMessage(StringUtils.format(
"GroupBy v1 does not support %s as false",
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
));
} else if (!vectorize) {
expectedException.expect(RuntimeException.class);
expectedException.expectMessage(StringUtils.format(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
"placementish",
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
));
} else {
cannotVectorize();
}

GroupByQuery query = makeQueryBuilder()
.setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
.setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
.setDimensions(new DefaultDimensionSpec("placementish", "alias"))
.setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
.setGranularity(QueryRunnerTestHelper.ALL_GRAN)
.overrideContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false))
.build();

GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
}

@Test
public void testMultiValueDimensionAsArray()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.Druids;
Expand All @@ -32,6 +33,7 @@
import org.apache.druid.query.filter.LikeDimFilter;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.orderby.DefaultLimitSpec;
import org.apache.druid.query.groupby.orderby.OrderByColumnSpec;
import org.apache.druid.query.ordering.StringComparators;
Expand All @@ -43,7 +45,9 @@
import org.apache.druid.sql.calcite.util.CalciteTests;
import org.junit.Test;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
{
Expand All @@ -53,7 +57,8 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception
{
// Cannot vectorize due to usage of expressions.
cannotVectorize();

Map<String, Object> groupByOnMultiValueColumnEnabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
groupByOnMultiValueColumnEnabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true);
List<Object[]> expected;
if (NullHandling.replaceWithDefault()) {
expected = ImmutableList.of(
Expand All @@ -76,6 +81,7 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception
}
testQuery(
"SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
groupByOnMultiValueColumnEnabled,
ImmutableList.of(
GroupByQuery.builder()
.setDataSource(CalciteTests.DATASOURCE3)
Expand Down Expand Up @@ -103,6 +109,30 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception
);
}

@Test
public void testMultiValueStringGroupByDoesNotWork() throws Exception
{
// Cannot vectorize due to usage of expressions.
cannotVectorize();
Map<String, Object> groupByOnMultiValueColumnDisabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
groupByOnMultiValueColumnDisabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false);
testQueryThrows(
"SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
groupByOnMultiValueColumnDisabled,
ImmutableList.of(),
exception -> {
exception.expect(RuntimeException.class);
expectedException.expectMessage(StringUtils.format(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
"v0",
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
));
}
);
}

@Test
public void testMultiValueStringWorksLikeStringGroupByWithFilter() throws Exception
{
Expand Down
4 changes: 4 additions & 0 deletions website/.spelling
Original file line number Diff line number Diff line change
Expand Up @@ -1565,6 +1565,8 @@ outputName
pushdown
row1
subtotalsSpec
unnested
unnesting
- ../docs/querying/having.md
HavingSpec
HavingSpecs
Expand Down Expand Up @@ -1595,6 +1597,8 @@ row4
t3
t4
t5
groupByEnableMultiValueUnnesting
unnesting
- ../docs/querying/multitenancy.md
500ms
tenant_id
Expand Down

0 comments on commit 5794331

Please sign in to comment.