Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nested columns + arrays = array columns! #13803

Merged
merged 24 commits into from
Mar 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a8bf82c
array columns!
clintropolis Feb 8, 2023
f66f0e5
add v4 segment test
clintropolis Feb 14, 2023
1acdad4
adjustments
clintropolis Feb 14, 2023
eb6976a
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Feb 19, 2023
524975b
add array element indexes
clintropolis Feb 20, 2023
dd4b899
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 1, 2023
d8e1b3f
fixup
clintropolis Mar 2, 2023
8ff7933
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 2, 2023
4f72c47
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 3, 2023
0070157
more test
clintropolis Mar 3, 2023
ae382d0
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 14, 2023
6712e7a
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 15, 2023
26ebeef
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 21, 2023
2ecd8bd
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 22, 2023
7bfb113
Merge remote-tracking branch 'upstream/master' into nested-array-columns
clintropolis Mar 23, 2023
9ea04a3
changes:
clintropolis Mar 23, 2023
82f01aa
reuse field index for stuff
clintropolis Mar 23, 2023
810649c
fix tests
clintropolis Mar 23, 2023
76da0fb
adjust
clintropolis Mar 23, 2023
313f486
adjust again
clintropolis Mar 23, 2023
665e0bc
nested array tests
clintropolis Mar 24, 2023
e3f48a7
style and comments
clintropolis Mar 24, 2023
e988d05
remove negative tests
clintropolis Mar 24, 2023
1515267
fix stuff
clintropolis Mar 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ public class AvroStreamInputFormatTest extends InitializedNullHandlingTest
private static final String TOPIC = "aTopic";
static final List<String> DIMENSIONS = Arrays.asList(EVENT_TYPE, ID, SOME_OTHER_ID, IS_VALID);
private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
"nested",
SOME_OTHER_ID,
"someIntArray",
"someFloat",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ public class AvroStreamInputRowParserTest
private static final ZonedDateTime DATE_TIME = ZonedDateTime.of(2015, 10, 25, 19, 30, 0, 0, ZoneOffset.UTC);
static final List<String> DIMENSIONS = Arrays.asList(EVENT_TYPE, ID, SOME_OTHER_ID, IS_VALID);
private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
"nested",
SOME_OTHER_ID,
"someIntArray",
"someFloat",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.apache.druid.java.util.common.parsers.JSONPathFieldSpec;
import org.apache.druid.java.util.common.parsers.JSONPathFieldType;
import org.apache.druid.java.util.common.parsers.JSONPathSpec;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.expression.TestExprMacroTable;
import org.apache.druid.segment.NestedDataDimensionSchema;
import org.apache.druid.segment.transform.ExpressionTransform;
Expand Down Expand Up @@ -453,7 +452,6 @@ public void testNestedColumnSchemaless() throws IOException
"middle",
"list",
"map",
"ts",
"decimal1"
);
try (CloseableIterator<InputRow> iterator = reader.read()) {
Expand Down Expand Up @@ -593,7 +591,6 @@ public void testListMap() throws IOException
@Test
public void testNestedArray() throws IOException
{
ExpressionProcessing.initializeForTests(true);
final InputFormat inputFormat = new OrcInputFormat(
new JSONPathSpec(
true,
Expand Down Expand Up @@ -669,9 +666,6 @@ public void testNestedArray() throws IOException
Assert.assertArrayEquals(new Object[]{1L, 2L}, (Object[]) row.getRaw("t_d_0"));
Assert.assertFalse(iterator.hasNext());
}
finally {
ExpressionProcessing.initializeForTests(null);
}
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ public void testNestedColumnSchemalessNestedTestFileNoNested() throws IOExceptio
);

List<InputRow> rows = readAllRows(reader);
Assert.assertEquals(ImmutableList.of("dim1", "metric1", "timestamp"), rows.get(0).getDimensions());
Assert.assertEquals(ImmutableList.of("dim1", "metric1"), rows.get(0).getDimensions());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the change in expectation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is related to the change to the dimension filtering in MapBasedInputRow, which now always filters out the timestamp spec column.

In 'normal' production code the timestamp spec is added to the dimension exclusions, so the code in MapBasedInputRow that computes dimensions ensures the time column is not in the dimensions list. However, in test code — especially in processing, which doesn't have access to the methods that take a dataschema and transform it into an input row schema — it's pretty easy to forget to explicitly add the timestamp column to the dimension exclusion list. As a result of not manually adding the timestamp column to the exclusions, it would end up in the dimensions list in schema discovery modes, as a string (or nested column, depending on config), which in rollup tests means it ends up as part of the rollup key, and so on. (Again, this doesn't happen in current production code because it goes through that translator utility method.)

I made the change there to always filter the timestamp spec column from the dimensions list to make it easier to not write wrong tests for schema discovery mode, which caused the change here and other places.

Assert.assertEquals(FlattenSpecParquetInputTest.TS1, rows.get(0).getTimestamp().toString());
Assert.assertEquals(ImmutableList.of("d1v1"), rows.get(0).getDimension("dim1"));
Assert.assertEquals("d1v1", rows.get(0).getRaw("dim1"));
Expand Down Expand Up @@ -218,7 +218,7 @@ public void testNestedColumnSchemalessNestedTestFile() throws IOException
);

List<InputRow> rows = readAllRows(reader);
Assert.assertEquals(ImmutableList.of("nestedData", "dim1", "metric1", "timestamp"), rows.get(0).getDimensions());
Assert.assertEquals(ImmutableList.of("nestedData", "dim1", "metric1"), rows.get(0).getDimensions());
Assert.assertEquals(FlattenSpecParquetInputTest.TS1, rows.get(0).getTimestamp().toString());
Assert.assertEquals(ImmutableList.of("d1v1"), rows.get(0).getDimension("dim1"));
Assert.assertEquals("d1v1", rows.get(0).getRaw("dim1"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public class ProtobufInputFormatTest
public void setUp() throws Exception
{
NullHandling.initializeForTests();
ExpressionProcessing.initializeForTests(null);
ExpressionProcessing.initializeForTests();
timestampSpec = new TimestampSpec("timestamp", "iso", null);
dimensionsSpec = new DimensionsSpec(Lists.newArrayList(
new StringDimensionSchema("event"),
Expand Down Expand Up @@ -243,7 +243,6 @@ public void testParseFlattenDataDiscover() throws Exception
.add("someFloatColumn")
.add("id")
.add("someBytesColumn")
.add("timestamp")
.build(),
row.getDimensions()
);
Expand Down Expand Up @@ -380,8 +379,7 @@ public void testParseNestedDataSchemaless() throws Exception
"someFloatColumn",
"eventType",
"id",
"someBytesColumn",
"timestamp"
"someBytesColumn"
),
row.getDimensions()
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
public class DelimitedInputFormat extends FlatTextInputFormat
{
public static final String TYPE_KEY = "tsv";

private static final String DEFAULT_DELIMITER = "\t";

@JsonCreator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
public class JsonInputFormat extends NestedInputFormat
{
public static final String TYPE_KEY = "json";

private final Map<String, Boolean> featureSpec;
private final ObjectMapper objectMapper;
private final boolean keepNullColumns;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.MapBasedInputRow;
Expand All @@ -36,6 +35,7 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class MapInputRowParser implements InputRowParser<Map<String, Object>>
{
Expand Down Expand Up @@ -77,22 +77,38 @@ public static InputRow parse(InputRowSchema inputRowSchema, Map<String, Object>
* 3) If isIncludeAllDimensions is not set and {@link DimensionsSpec#getDimensionNames()} is empty,
* the dimensions in the given map is returned.
*
* In any case, the returned list does not include any dimensions in {@link DimensionsSpec#getDimensionExclusions()}.
* In any case, the returned list does not include any dimensions in {@link DimensionsSpec#getDimensionExclusions()}
* or {@link TimestampSpec#getTimestampColumn()}.
*/
private static List<String> findDimensions(
    TimestampSpec timestampSpec,
    DimensionsSpec dimensionsSpec,
    Map<String, Object> rawInputRow
)
{
  // The timestamp column is always filtered from the returned dimensions, in addition to any
  // explicitly configured dimension exclusions, so schema-discovery modes never treat the
  // time column as a regular dimension.
  final String timestampColumn = timestampSpec.getTimestampColumn();
  final Set<String> exclusions = dimensionsSpec.getDimensionExclusions();
  if (dimensionsSpec.isIncludeAllDimensions()) {
    // Explicitly declared dimensions first (preserving declaration order), then any
    // additional fields discovered in the input row.
    LinkedHashSet<String> dimensions = new LinkedHashSet<>(dimensionsSpec.getDimensionNames());
    for (String field : rawInputRow.keySet()) {
      if (timestampColumn.equals(field) || exclusions.contains(field)) {
        continue;
      }
      dimensions.add(field);
    }
    return new ArrayList<>(dimensions);
  } else {
    if (!dimensionsSpec.getDimensionNames().isEmpty()) {
      // An explicit dimension list takes precedence; use it as-is.
      return dimensionsSpec.getDimensionNames();
    } else {
      // Schemaless: discover dimensions from the raw input row, skipping the timestamp
      // column and any configured exclusions.
      List<String> dimensions = new ArrayList<>();
      for (String field : rawInputRow.keySet()) {
        if (timestampColumn.equals(field) || exclusions.contains(field)) {
          continue;
        }
        dimensions.add(field);
      }
      return dimensions;
    }
  }
}
Expand All @@ -104,7 +120,7 @@ static InputRow parse(
Map<String, Object> theMap
) throws ParseException
{
final List<String> dimensionsToUse = findDimensions(dimensionsSpec, theMap);
final List<String> dimensionsToUse = findDimensions(timestampSpec, dimensionsSpec, theMap);

final DateTime timestamp;
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,6 @@ public ArrayExpr(ExpressionType outputType, @Nullable Object[] value)
{
super(outputType, value);
Preconditions.checkArgument(outputType.isArray(), "Output type %s is not an array", outputType);
ExpressionType.checkNestedArrayAllowed(outputType);
}

@Override
Expand Down
33 changes: 18 additions & 15 deletions processing/src/main/java/org/apache/druid/math/expr/ExprEval.java
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,26 @@ public static ExprEval ofComplex(ExpressionType outputType, @Nullable Object val
return new ComplexExprEval(outputType, value);
}

public static ExprEval bestEffortArray(@Nullable List<?> theList)
{
// do not convert empty lists to arrays with a single null element here, because that should have been done
// by the selectors preparing their ObjectBindings if necessary. If we get to this point it was legitimately
// empty
NonnullPair<ExpressionType, Object[]> coerced = coerceListToArray(theList, false);
if (coerced == null) {
return bestEffortOf(null);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling bestEffortOf(null) looks to me like it's going to go through like 15 if statements, failing them all before just returning new StringExprEval(null). Why not just return the good thing here given that we already know what it should be and avoid the potential branches?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would probably be best to have null handled first in bestEffortOf so that if we ever decide to represent null as something more sensible (like introduce a null type), we won't be making a 'string' here, not to mention the saving of not running through a bunch of checks for a null value.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I expected you to actually to create a method for makeNullNode() or something in ExprEval instead of newing it up directly here. Just moving around the ordering of the statements is 🤷 . Though, checking for null first is probably better than last.

}
return ofArray(coerced.lhs, coerced.rhs);
}

/**
* Examine java type to find most appropriate expression type
*/
public static ExprEval bestEffortOf(@Nullable Object val)
{
if (val == null) {
return new StringExprEval(null);
}
if (val instanceof ExprEval) {
return (ExprEval) val;
}
Expand Down Expand Up @@ -468,14 +483,7 @@ public static ExprEval bestEffortOf(@Nullable Object val)

if (val instanceof List || val instanceof Object[]) {
final List<?> theList = val instanceof List ? ((List<?>) val) : Arrays.asList((Object[]) val);
// do not convert empty lists to arrays with a single null element here, because that should have been done
// by the selectors preparing their ObjectBindings if necessary. If we get to this point it was legitimately
// empty
NonnullPair<ExpressionType, Object[]> coerced = coerceListToArray(theList, false);
if (coerced == null) {
return bestEffortOf(null);
}
return ofArray(coerced.lhs, coerced.rhs);
return bestEffortArray(theList);
}

// in 'best effort' mode, we couldn't possibly use byte[] as a complex or anything else useful without type
Expand All @@ -485,12 +493,8 @@ public static ExprEval bestEffortOf(@Nullable Object val)
return new StringExprEval(StringUtils.encodeBase64String((byte[]) val));
}

if (val != null) {
// is this cool?
return new ComplexExprEval(ExpressionType.UNKNOWN_COMPLEX, val);
}

return new StringExprEval(null);
// is this cool?
return new ComplexExprEval(ExpressionType.UNKNOWN_COMPLEX, val);
}

public static ExprEval ofType(@Nullable ExpressionType type, @Nullable Object value)
Expand Down Expand Up @@ -1109,7 +1113,6 @@ private ArrayExprEval(ExpressionType arrayType, @Nullable Object[] value)
super(value);
this.arrayType = arrayType;
Preconditions.checkArgument(arrayType.isArray(), "Output type %s is not an array", arrayType);
ExpressionType.checkNestedArrayAllowed(arrayType);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.inject.Inject;

import javax.annotation.Nullable;

/**
* Like {@link org.apache.druid.common.config.NullHandling}, except for expressions processing configs
*/
Expand All @@ -43,33 +41,23 @@ public class ExpressionProcessing
/**
* Many unit tests do not setup modules for this value to be injected, this method provides a manual way to initialize
* {@link #INSTANCE}
* @param allowNestedArrays
*/
@VisibleForTesting
public static void initializeForTests(@Nullable Boolean allowNestedArrays)
public static void initializeForTests()
{
INSTANCE = new ExpressionProcessingConfig(allowNestedArrays, null, null, null);
INSTANCE = new ExpressionProcessingConfig(null, null, null);
}

@VisibleForTesting
public static void initializeForStrictBooleansTests(boolean useStrict)
{
INSTANCE = new ExpressionProcessingConfig(null, useStrict, null, null);
INSTANCE = new ExpressionProcessingConfig(useStrict, null, null);
}

@VisibleForTesting
public static void initializeForHomogenizeNullMultiValueStrings()
{
INSTANCE = new ExpressionProcessingConfig(null, null, null, true);
}

/**
* [['is expression support for'],['nested arrays'],['enabled?']]
*/
public static boolean allowNestedArrays()
{
checkInitialized();
return INSTANCE.allowNestedArrays();
INSTANCE = new ExpressionProcessingConfig(null, null, true);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

public class ExpressionProcessingConfig
{
public static final String NESTED_ARRAYS_CONFIG_STRING = "druid.expressions.allowNestedArrays";
public static final String NULL_HANDLING_LEGACY_LOGICAL_OPS_STRING = "druid.expressions.useStrictBooleans";
// Coerce arrays to multi value strings
public static final String PROCESS_ARRAYS_AS_MULTIVALUE_STRINGS_CONFIG_STRING =
Expand All @@ -35,9 +34,6 @@ public class ExpressionProcessingConfig
public static final String HOMOGENIZE_NULL_MULTIVALUE_STRING_ARRAYS =
"druid.expressions.homogenizeNullMultiValueStringArrays";

@JsonProperty("allowNestedArrays")
private final boolean allowNestedArrays;

@JsonProperty("useStrictBooleans")
private final boolean useStrictBooleans;

Expand All @@ -49,13 +45,11 @@ public class ExpressionProcessingConfig

@JsonCreator
public ExpressionProcessingConfig(
@JsonProperty("allowNestedArrays") @Nullable Boolean allowNestedArrays,
@JsonProperty("useStrictBooleans") @Nullable Boolean useStrictBooleans,
@JsonProperty("processArraysAsMultiValueStrings") @Nullable Boolean processArraysAsMultiValueStrings,
@JsonProperty("homogenizeNullMultiValueStringArrays") @Nullable Boolean homogenizeNullMultiValueStringArrays
)
{
this.allowNestedArrays = getWithPropertyFallbackFalse(allowNestedArrays, NESTED_ARRAYS_CONFIG_STRING);
this.useStrictBooleans = getWithPropertyFallbackFalse(useStrictBooleans, NULL_HANDLING_LEGACY_LOGICAL_OPS_STRING);
this.processArraysAsMultiValueStrings = getWithPropertyFallbackFalse(
processArraysAsMultiValueStrings,
Expand All @@ -67,11 +61,6 @@ public ExpressionProcessingConfig(
);
}

public boolean allowNestedArrays()
{
return allowNestedArrays;
}

public boolean isUseStrictBooleans()
{
return useStrictBooleans;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.segment.column.BaseTypeSignature;
import org.apache.druid.segment.column.ColumnType;
Expand Down Expand Up @@ -210,11 +209,4 @@ public static ColumnType toColumnType(ExpressionType exprType)
throw new ISE("Unsupported expression type[%s]", exprType);
}
}

public static void checkNestedArrayAllowed(ExpressionType outputType)
{
if (outputType.isArray() && outputType.getElementType().isArray() && !ExpressionProcessing.allowNestedArrays()) {
throw new IAE("Cannot create a nested array type [%s], 'druid.expressions.allowNestedArrays' must be set to true", outputType);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3054,7 +3054,6 @@ static ExpressionType setArrayOutput(@Nullable ExpressionType arrayType, Object[
if (arrayType == null) {
arrayType = ExpressionTypeFactory.getInstance().ofArray(evaluated.type());
}
ExpressionType.checkNestedArrayAllowed(arrayType);
if (arrayType.getElementType().isNumeric() && evaluated.isNumericNull()) {
out[i] = null;
} else if (!evaluated.asArrayType().equals(arrayType)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,25 @@ public Expr apply(List<Expr> args)
final StructuredDataProcessor processor = new StructuredDataProcessor()
{
@Override
public StructuredDataProcessor.ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
public ProcessedValue<?> processField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
{
// do nothing, we only want the list of fields returned by this processor
return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
return ProcessedValue.NULL_LITERAL;
}

@Nullable
@Override
public ProcessedValue<?> processArrayField(
ArrayList<NestedPathPart> fieldPath,
@Nullable List<?> array
)
{
// we only want to return a non-null value here if the value is an array of primitive values
ExprEval<?> eval = ExprEval.bestEffortArray(array);
if (eval.type().isArray() && eval.type().getElementType().isPrimitive()) {
return ProcessedValue.NULL_LITERAL;
}
return null;
}
};

Expand Down
Loading