From fdcc4c6c911f162344d50a90f32625aa5601f330 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Sun, 18 Jun 2023 08:28:31 -0700 Subject: [PATCH 1/9] [Kernel] Default Parquet reader implementation --- kernel/build.sbt | 6 +- .../delta/kernel/utils/CloseableIterator.java | 3 +- .../io/delta/kernel/DefaultKernelUtils.java | 187 ++++++ .../kernel/client/DefaultParquetHandler.java | 133 +++++ .../kernel/data/DefaultColumnarBatch.java | 50 ++ .../data/vector/AbstractColumnVector.java | 171 ++++++ .../data/vector/DefaultArrayVector.java | 80 +++ .../data/vector/DefaultBinaryVector.java | 96 +++ .../data/vector/DefaultBooleanVector.java | 69 +++ .../kernel/data/vector/DefaultByteVector.java | 70 +++ .../data/vector/DefaultConstantVector.java | 135 +++++ .../data/vector/DefaultDoubleVector.java | 69 +++ .../data/vector/DefaultFloatVector.java | 69 +++ .../kernel/data/vector/DefaultIntVector.java | 73 +++ .../kernel/data/vector/DefaultLongVector.java | 69 +++ .../kernel/data/vector/DefaultMapVector.java | 85 +++ .../data/vector/DefaultShortVector.java | 70 +++ .../data/vector/DefaultStructVector.java | 146 +++++ .../delta/kernel/data/vector/VectorUtils.java | 69 +++ .../delta/kernel/parquet/ArrayConverter.java | 181 ++++++ .../io/delta/kernel/parquet/MapConverter.java | 198 +++++++ .../kernel/parquet/ParquetBatchReader.java | 207 +++++++ .../kernel/parquet/ParquetConverters.java | 553 ++++++++++++++++++ .../io/delta/kernel/parquet/RowConverter.java | 153 +++++ .../parquet/TestParquetBatchReader.java | 403 +++++++++++++ .../kernel/utils/DefaultKernelTestUtils.java | 25 + .../test/resources/parquet/all_types.parquet | Bin 0 -> 24532 bytes 27 files changed, 3364 insertions(+), 6 deletions(-) create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java create mode 100644 
kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/AbstractColumnVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultConstantVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultDoubleVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultFloatVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultIntVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultMapVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/VectorUtils.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/parquet/MapConverter.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetBatchReader.java create mode 100644 kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java create mode 100644 
kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java create mode 100644 kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java create mode 100644 kernel/kernel-default/src/test/java/io/delta/kernel/utils/DefaultKernelTestUtils.java create mode 100644 kernel/kernel-default/src/test/resources/parquet/all_types.parquet diff --git a/kernel/build.sbt b/kernel/build.sbt index 257e4ea4f69..940db269b13 100644 --- a/kernel/build.sbt +++ b/kernel/build.sbt @@ -78,17 +78,13 @@ lazy val kernelDefault = (project in file("kernel-default")) scalaStyleSettings, releaseSettings, libraryDependencies ++= Seq( - "org.apache.hadoop" % "hadoop-client-api" % hadoopVersion, // Configuration, Path + "org.apache.hadoop" % "hadoop-client-runtime" % hadoopVersion, // Configuration, Path "io.delta" % "delta-storage" % deltaStorageVersion, // LogStore "com.fasterxml.jackson.core" % "jackson-databind" % "2.13.5", // ObjectMapper "org.apache.parquet" % "parquet-hadoop" % "1.12.3", "org.scalatest" %% "scalatest" % scalaTestVersion % "test", "io.delta" %% "delta-core" % deltaSparkVersion % "test", - "org.apache.spark" %% "spark-sql" % sparkVersion % "test", // SparkSession - "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests", "junit" % "junit" % "4.11" % "test", "com.novocode" % "junit-interface" % "0.11" % "test" ) diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/CloseableIterator.java b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/CloseableIterator.java index d32e92ca8c0..b334e947024 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/CloseableIterator.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/CloseableIterator.java @@ -22,7 +22,8 @@ import java.util.function.Consumer; import 
java.util.function.Function; -public interface CloseableIterator extends Iterator, Closeable { +public interface CloseableIterator extends Iterator, Closeable +{ default CloseableIterator map(Function mapper) { CloseableIterator delegate = this; return new CloseableIterator() { diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java new file mode 100644 index 00000000000..7be9eaacdee --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java @@ -0,0 +1,187 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel; + +import java.util.ArrayList; +import java.util.List; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; + +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.StructField; +import io.delta.kernel.types.StructType; + +public class DefaultKernelUtils +{ + private DefaultKernelUtils() {} + + /** + * Given the file schema in Parquet file and selected columns by Delta, return + * a subschema of the file schema. 
+ * + * @param fileSchema + * @param deltaType + * @return + */ + public static final MessageType pruneSchema( + MessageType fileSchema, // parquet + StructType deltaType) // delta-core + { + return deltaType.fields().stream() + .map(column -> { + Type type = findStructField(fileSchema, column); + if (type == null) { + return null; + } + Type prunedSubfields = pruneSubfields(type, column.getDataType()); + return new MessageType(column.getName(), prunedSubfields); + }) + .filter(type -> type != null) + .reduce(MessageType::union) + .get(); + } + + private static Type findStructField(MessageType fileSchema, StructField column) + { + // TODO: Need a way to search by id once we start supporting column mapping `id` mode. + final String columnName = column.getName(); + if (fileSchema.containsField(columnName)) { + return fileSchema.getType(columnName); + } + // Parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase + // check for direct match above but if no match found, try case-insensitive match + for (org.apache.parquet.schema.Type type : fileSchema.getFields()) { + if (type.getName().equalsIgnoreCase(columnName)) { + return type; + } + } + + return null; + } + + private static Type pruneSubfields(Type type, DataType deltaDatatype) + { + if (!(deltaDatatype instanceof StructType)) { + // there is no pruning for non-struct types + return type; + } + + GroupType groupType = (GroupType) type; + StructType deltaStructType = (StructType) deltaDatatype; + List newParquetSubFields = new ArrayList<>(); + for (StructField subField : deltaStructType.fields()) { + String subFieldName = subField.getName(); + Type parquetSubFieldType = groupType.getType(subFieldName); + if (parquetSubFieldType == null) { + for (org.apache.parquet.schema.Type typeTemp : groupType.getFields()) { + if (typeTemp.getName().equalsIgnoreCase(subFieldName)) { + parquetSubFieldType = type; + } + } + } + newParquetSubFields.add(parquetSubFieldType); + } + return 
groupType.withNewFields(newParquetSubFields); + } + + /** + * Precondition-style validation that throws {@link IllegalArgumentException}. + * + * @param isValid {@code true} if valid, {@code false} if an exception should be thrown + * @throws IllegalArgumentException if {@code isValid} is false + */ + public static void checkArgument(boolean isValid) + throws IllegalArgumentException + { + if (!isValid) { + throw new IllegalArgumentException(); + } + } + + /** + * Precondition-style validation that throws {@link IllegalArgumentException}. + * + * @param isValid {@code true} if valid, {@code false} if an exception should be thrown + * @param message A String message for the exception. + * @throws IllegalArgumentException if {@code isValid} is false + */ + public static void checkArgument(boolean isValid, String message) + throws IllegalArgumentException + { + if (!isValid) { + throw new IllegalArgumentException(message); + } + } + + /** + * Precondition-style validation that throws {@link IllegalArgumentException}. + * + * @param isValid {@code true} if valid, {@code false} if an exception should be thrown + * @param message A String message for the exception. + * @param args Objects used to fill in {@code %s} placeholders in the message + * @throws IllegalArgumentException if {@code isValid} is false + */ + public static void checkArgument(boolean isValid, String message, Object... args) + throws IllegalArgumentException + { + if (!isValid) { + throw new IllegalArgumentException( + String.format(String.valueOf(message), args)); + } + } + + /** + * Precondition-style validation that throws {@link IllegalStateException}. + * + * @param isValid {@code true} if valid, {@code false} if an exception should be thrown + * @param message A String message for the exception. 
+ * @throws IllegalStateException if {@code isValid} is false + */ + public static void checkState(boolean isValid, String message) + throws IllegalStateException + { + if (!isValid) { + throw new IllegalStateException(message); + } + } + + /** + * Search for the Parquet type for in the {@code groupType} for the field equilant to + * {@code field}. + * + * @param groupType Parquet group type coming from the file schema. + * @param field Sub field given as Delta Kernel's {@link StructField} + * @return {@link Type} of the Parquet field. Returns {@code null}, if not found. + */ + public static Type findFieldType(GroupType groupType, StructField field) + { + // TODO: Need a way to search by id once we start supporting column mapping `id` mode. + final String columnName = field.getName(); + if (groupType.containsField(columnName)) { + return groupType.getType(columnName); + } + // Parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase + // check for direct match above but if no match found, try case-insensitive match + for (org.apache.parquet.schema.Type type : groupType.getFields()) { + if (type.getName().equalsIgnoreCase(columnName)) { + return type; + } + } + + return null; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java new file mode 100644 index 00000000000..0c2dd8ea78d --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java @@ -0,0 +1,133 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.client; + +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; + +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.FileDataReadResult; +import io.delta.kernel.data.Row; +import io.delta.kernel.expressions.Expression; +import io.delta.kernel.fs.FileStatus; +import io.delta.kernel.parquet.ParquetBatchReader; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.Utils; + +public class DefaultParquetHandler + implements ParquetHandler +{ + private final Configuration hadoopConf; + + public DefaultParquetHandler(Configuration hadoopConf) + { + this.hadoopConf = hadoopConf; + } + + @Override + public CloseableIterator contextualizeFileReads( + CloseableIterator fileIter, + Expression predicate) + { + return new CloseableIterator() + { + @Override + public void close() + throws IOException + { + fileIter.close(); + } + + @Override + public boolean hasNext() + { + return fileIter.hasNext(); + } + + @Override + public FileReadContext next() + { + return () -> fileIter.next(); + } + }; + } + + @Override + public CloseableIterator readParquetFiles( + CloseableIterator fileIter, + StructType physicalSchema) throws IOException + { + return new CloseableIterator() + { + private FileReadContext currentFile; + private CloseableIterator currentFileReader; + + @Override + public void close() + throws IOException + { + if (currentFileReader != null) { + currentFileReader.close(); + } + + fileIter.close(); + // TODO: implement 
safe close of multiple closeables. + } + + @Override + public boolean hasNext() + { + // There is no file in reading or the current file being read has no more data + // initialize the next file reader or return false if there are no more files to + // read. + if (currentFileReader == null || !currentFileReader.hasNext()) { + if (fileIter.hasNext()) { + currentFile = fileIter.next(); + FileStatus fileStatus = Utils.getFileStatus(currentFile.getScanFileRow()); + ParquetBatchReader batchReader = new ParquetBatchReader(hadoopConf); + currentFileReader = batchReader.read(fileStatus.getPath(), physicalSchema); + } + else { + return false; + } + } + + return currentFileReader.hasNext(); + } + + @Override + public FileDataReadResult next() + { + final ColumnarBatch data = currentFileReader.next(); + return new FileDataReadResult() + { + @Override + public ColumnarBatch getData() + { + return data; + } + + @Override + public Row getScanFileRow() + { + return currentFile.getScanFileRow(); + } + }; + } + }; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java new file mode 100644 index 00000000000..e227b05e993 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java @@ -0,0 +1,50 @@ +package io.delta.kernel.data; + +import io.delta.kernel.types.StructType; + +public class DefaultColumnarBatch + implements ColumnarBatch +{ + private final StructType dataType; + private final int size; + private final ColumnVector[] columnVectors; + + public DefaultColumnarBatch( + int size, + StructType dataType, + ColumnVector[] columnVectors + ) + { + this.dataType = dataType; + this.size = size; + this.columnVectors = new ColumnVector[columnVectors.length]; + // TODO: argument check. 
+ System.arraycopy(columnVectors, 0, this.columnVectors, 0, columnVectors.length); + } + + @Override + public StructType getSchema() + { + return dataType; + } + + @Override + public ColumnVector getColumnVector(int ordinal) + { + checkColumnOrdinal(ordinal); + return columnVectors[ordinal]; + } + + @Override + public int getSize() + { + return size; + } + + private void checkColumnOrdinal(int ordinal) + { + if (ordinal < 0 || ordinal >= columnVectors.length) { + throw new IllegalArgumentException("invalid column ordinal"); + } + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/AbstractColumnVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/AbstractColumnVector.java new file mode 100644 index 00000000000..d3866fd96de --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/AbstractColumnVector.java @@ -0,0 +1,171 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import java.util.List; +import java.util.Map; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.Row; +import io.delta.kernel.types.DataType; + +/** + * Abstract implementation of {@link ColumnVector} that provides the default functionality + * common to most of the specific data type {@link ColumnVector} implementations. + */ +public abstract class AbstractColumnVector + implements ColumnVector +{ + private final int size; + private final DataType dataType; + private final Optional nullability; + + protected AbstractColumnVector(int size, DataType dataType, Optional nullability) + { + checkArgument(size >= 0, "invalid size: %s", size); + this.size = size; + this.dataType = requireNonNull(dataType); + this.nullability = requireNonNull(nullability); + } + + @Override + public DataType getDataType() + { + return dataType; + } + + @Override + public int getSize() + { + return size; + } + + @Override + public void close() + { + // By default, nothing to close, if the implementation has any resources to release + // it can override it + } + + /** + * Is the value at given {@code rowId} index is null? 
+ * + * @param rowId + * @return + */ + @Override + public boolean isNullAt(int rowId) + { + checkValidRowId(rowId); + return !nullability.isPresent() || nullability.get()[rowId]; + } + + @Override + public boolean getBoolean(int rowId) + { + throw unsupportedDataAccessException("boolean"); + } + + @Override + public byte getByte(int rowId) + { + throw unsupportedDataAccessException("byte"); + } + + @Override + public short getShort(int rowId) + { + throw unsupportedDataAccessException("short"); + } + + @Override + public int getInt(int rowId) + { + throw unsupportedDataAccessException("int"); + } + + @Override + public long getLong(int rowId) + { + throw unsupportedDataAccessException("long"); + } + + @Override + public float getFloat(int rowId) + { + throw unsupportedDataAccessException("float"); + } + + @Override + public double getDouble(int rowId) + { + throw unsupportedDataAccessException("double"); + } + + @Override + public byte[] getBinary(int rowId) + { + throw unsupportedDataAccessException("binary"); + } + + @Override + public String getString(int rowId) + { + throw unsupportedDataAccessException("string"); + } + + @Override + public Map getMap(int rowId) + { + throw unsupportedDataAccessException("map"); + } + + @Override + public Row getStruct(int rowId) + { + throw unsupportedDataAccessException("struct"); + } + + @Override + public List getArray(int rowId) + { + throw unsupportedDataAccessException("array"); + } + + protected UnsupportedOperationException unsupportedDataAccessException(String accessType) + { + String msg = String.format( + "Trying to access a `%s` value from vector of type `%s`", + accessType, + getDataType()); + throw new UnsupportedOperationException(msg); + } + + /** + * Helper method that make sure the given {@code rowId} position is valid in this vector + * + * @param rowId + */ + protected void checkValidRowId(int rowId) + { + if (rowId < 0 || rowId >= size) { + throw new IllegalArgumentException("invalid row access"); + } + } 
+} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java new file mode 100644 index 00000000000..be85b62b554 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java @@ -0,0 +1,80 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import java.util.ArrayList; +import java.util.List; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.types.DataType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for array type data. + */ +public class DefaultArrayVector + extends AbstractColumnVector +{ + private final int[] offsets; + private final ColumnVector elementVector; + + /** + * Create an instance of {@link io.delta.kernel.data.ColumnVector} for array type. + * + * @param size number of elements in the vector. + * @param nullability Optional array of nullability value for each element in the vector. + * All values in the vector are considered non-null when parameter is empty. + * @param offsets Offsets into element vector on where the index of particular row + * values start and end. 
+ * @param elementVector Vector containing the array elements. + */ + public DefaultArrayVector( + int size, + DataType type, + Optional nullability, + int[] offsets, + ColumnVector elementVector) + { + super(size, type, nullability); + checkArgument(offsets.length >= size + 1, "invalid offset array size"); + this.offsets = requireNonNull(offsets, "offsets is null"); + this.elementVector = requireNonNull(elementVector, "elementVector is null"); + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * + * @param rowId + * @return + */ + @Override + public List getArray(int rowId) + { + checkValidRowId(rowId); + int start = offsets[rowId]; + int end = offsets[rowId + 1]; + + List values = new ArrayList<>(); + for (int entry = start; entry < end; entry++) { + Object key = VectorUtils.getValueAsObject(elementVector, entry); + values.add((T) key); + } + return values; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java new file mode 100644 index 00000000000..43129eac93a --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java @@ -0,0 +1,96 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.types.BinaryType; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.StringType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for binary type data. + */ +public class DefaultBinaryVector + extends AbstractColumnVector +{ + private final byte[][] values; + + /** + * Create an instance of {@link io.delta.kernel.data.ColumnVector} for binary type. + * + * @param size number of elements in the vector. + * @param values column vector values. + */ + public DefaultBinaryVector(DataType dataType, int size, byte[][] values) + { + super(size, dataType, Optional.empty()); + checkArgument(dataType instanceof StringType || dataType instanceof BinaryType, + "invalid type"); + this.values = requireNonNull(values, "values is null"); + checkArgument(values.length >= size, + "invalid number of values (%s) for given size (%s)", values.length, size); + checkArgument(values.length >= 0, "invalid vector size: %s", values.length); + } + + @Override + public boolean isNullAt(int rowId) + { + checkValidRowId(rowId); + return values[rowId] == null; + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * The error check on {@code rowId} explicitly skipped for performance reasons. 
+ * + * @param rowId + * @return + */ + @Override + public String getString(int rowId) + { + if (!(getDataType() instanceof StringType)) { + throw unsupportedDataAccessException("string"); + } + checkValidRowId(rowId); + byte[] value = values[rowId]; + if (value == null) { + return null; + } + return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(value)).toString(); + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * The error check on {@code rowId} explicitly skipped for performance reasons. + * + * @param rowId + * @return + */ + @Override + public byte[] getBinary(int rowId) + { + checkValidRowId(rowId); + return values[rowId]; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java new file mode 100644 index 00000000000..1e68effb456 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java @@ -0,0 +1,69 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.types.BooleanType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for boolean type data. + */ +public class DefaultBooleanVector + extends AbstractColumnVector +{ + private final boolean[] values; + + /** + * Create an instance of {@link io.delta.kernel.data.ColumnVector} for boolean type. + * + * @param size number of elements in the vector. + * @param nullability Optional array of nullability value for each element in the vector. + * All values in the vector are considered non-null when parameter is empty. + * @param values column vector values. + */ + public DefaultBooleanVector(int size, Optional nullability, boolean[] values) + { + super(size, BooleanType.INSTANCE, nullability); + this.values = requireNonNull(values, "values is null"); + checkArgument(values.length >= 0, "invalid vector size: %s", values.length); + checkArgument(values.length >= size, + "invalid number of values (%s) for given size (%s)", values.length, size); + if (nullability.isPresent()) { + checkArgument(values.length == nullability.get().length, + "vector element components are not of same size" + + "value array size = %s, nullability array size = %s", + values.length, nullability.get().length + ); + } + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. 
+ * + * @param rowId + * @return + */ + @Override + public boolean getBoolean(int rowId) + { + checkValidRowId(rowId); + return values[rowId]; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java new file mode 100644 index 00000000000..e146d9a99aa --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java @@ -0,0 +1,70 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.IntegerType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for byte type data. + */ +public class DefaultByteVector + extends AbstractColumnVector +{ + private final byte[] values; + + /** + * Create an instance of {@link io.delta.kernel.data.ColumnVector} for byte type. + * + * @param size number of elements in the vector. + * @param nullability Optional array of nullability value for each element in the vector. + * All values in the vector are considered non-null when parameter is empty. + * @param values column vector values. 
+ */ + public DefaultByteVector(int size, Optional nullability, byte[] values) + { + super(size, ByteType.INSTANCE, nullability); + this.values = requireNonNull(values, "values is null"); + checkArgument(values.length >= 0, "invalid vector size: %s", values.length); + checkArgument(values.length >= size, + "invalid number of values (%s) for given size (%s)", values.length, size); + if (nullability.isPresent()) { + checkArgument(values.length == nullability.get().length, + "vector element components are not of same size" + + "value array size = %s, nullability array size = %s", + values.length, nullability.get().length + ); + } + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * + * @param rowId + * @return + */ + @Override + public byte getByte(int rowId) + { + checkValidRowId(rowId); + return values[rowId]; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultConstantVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultConstantVector.java new file mode 100644 index 00000000000..9729972f0de --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultConstantVector.java @@ -0,0 +1,135 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package io.delta.kernel.data.vector;

import java.util.List;
import java.util.Map;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.Row;
import io.delta.kernel.types.DataType;

/**
 * {@link ColumnVector} that reports the same (possibly null) value for every row.
 * Useful for partition columns and other per-file constants.
 */
public class DefaultConstantVector
    implements ColumnVector
{
    private final DataType dataType;
    private final int numRows;
    private final Object value;

    /**
     * @param dataType logical type of {@code value}
     * @param numRows number of rows this vector reports
     * @param value the constant value; {@code null} makes every row null
     */
    public DefaultConstantVector(DataType dataType, int numRows, Object value)
    {
        // TODO: Validate datatype and value object type
        this.dataType = dataType;
        this.numRows = numRows;
        this.value = value;
    }

    @Override
    public DataType getDataType()
    {
        return dataType;
    }

    @Override
    public int getSize()
    {
        return numRows;
    }

    @Override
    public void close()
    {
        // nothing to close
    }

    // Null-ness is uniform across all rows: it depends only on the constant itself.
    @Override
    public boolean isNullAt(int rowId)
    {
        return value == null;
    }

    // Each typed accessor below simply casts the constant; behavior for a type
    // mismatch is a ClassCastException, same as reading a wrongly-typed vector.
    @Override
    public boolean getBoolean(int rowId)
    {
        return (boolean) value;
    }

    @Override
    public byte getByte(int rowId)
    {
        return (byte) value;
    }

    @Override
    public short getShort(int rowId)
    {
        return (short) value;
    }

    @Override
    public int getInt(int rowId)
    {
        return (int) value;
    }

    @Override
    public long getLong(int rowId)
    {
        return (long) value;
    }

    @Override
    public float getFloat(int rowId)
    {
        return (float) value;
    }

    @Override
    public double getDouble(int rowId)
    {
        return (double) value;
    }

    @Override
    public byte[] getBinary(int rowId)
    {
        return (byte[]) value;
    }

    @Override
    public String getString(int rowId)
    {
        return (String) value;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <K, V> Map<K, V> getMap(int rowId)
    {
        return (Map<K, V>) value;
    }

    @Override
    public Row getStruct(int rowId)
    {
        return (Row) value;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <T> List<T> getArray(int rowId)
    {
        return (List<T>) value;
    }
}
package io.delta.kernel.data.vector;

import static io.delta.kernel.DefaultKernelUtils.checkArgument;
import static java.util.Objects.requireNonNull;
import java.util.Optional;

import io.delta.kernel.types.DoubleType;

/**
 * {@link io.delta.kernel.data.ColumnVector} implementation for double type data.
 */
public class DefaultDoubleVector
    extends AbstractColumnVector
{
    private final double[] values;

    /**
     * Create an instance of {@link io.delta.kernel.data.ColumnVector} for double type.
     *
     * @param size number of elements in the vector.
     * @param nullability Optional array of nullability value for each element in the vector.
     *                    All values in the vector are considered non-null when parameter is empty.
     * @param values column vector values.
     */
    public DefaultDoubleVector(int size, Optional<boolean[]> nullability, double[] values)
    {
        // Bug fix: previously passed LongType.INSTANCE, so a double vector reported
        // LongType from getDataType(); it must report DoubleType.
        super(size, DoubleType.INSTANCE, nullability);
        this.values = requireNonNull(values, "values is null");
        checkArgument(values.length >= size,
            "invalid number of values (%s) for given size (%s)", values.length, size);
        if (nullability.isPresent()) {
            checkArgument(values.length == nullability.get().length,
                "vector element components are not of same size"
                    + "value array size = %s, nullability array size = %s",
                values.length, nullability.get().length
            );
        }
    }

    /**
     * Get the value at given {@code rowId}. The return value is undefined and can be
     * anything, if the slot for {@code rowId} is null.
     *
     * @param rowId index of the element to read
     * @return the double stored at {@code rowId}
     */
    @Override
    public double getDouble(int rowId)
    {
        checkValidRowId(rowId);
        return values[rowId];
    }
}
package io.delta.kernel.data.vector;

import static io.delta.kernel.DefaultKernelUtils.checkArgument;
import static java.util.Objects.requireNonNull;
import java.util.Optional;

import io.delta.kernel.types.FloatType;

/**
 * {@link io.delta.kernel.data.ColumnVector} implementation for float type data.
 */
public class DefaultFloatVector
    extends AbstractColumnVector
{
    private final float[] values;

    /**
     * Create an instance of {@link io.delta.kernel.data.ColumnVector} for float type.
     *
     * @param size number of elements in the vector.
     * @param nullability Optional array of nullability value for each element in the vector.
     *                    All values in the vector are considered non-null when parameter is empty.
     * @param values column vector values.
     */
    public DefaultFloatVector(int size, Optional<boolean[]> nullability, float[] values)
    {
        // Bug fix: previously passed LongType.INSTANCE, so a float vector reported
        // LongType from getDataType(); it must report FloatType.
        super(size, FloatType.INSTANCE, nullability);
        this.values = requireNonNull(values, "values is null");
        checkArgument(values.length >= size,
            "invalid number of values (%s) for given size (%s)", values.length, size);
        if (nullability.isPresent()) {
            checkArgument(values.length == nullability.get().length,
                "vector element components are not of same size"
                    + "value array size = %s, nullability array size = %s",
                values.length, nullability.get().length
            );
        }
    }

    /**
     * Get the value at given {@code rowId}. The return value is undefined and can be
     * anything, if the slot for {@code rowId} is null.
     *
     * @param rowId index of the element to read
     * @return the float stored at {@code rowId}
     */
    @Override
    public float getFloat(int rowId)
    {
        checkValidRowId(rowId);
        return values[rowId];
    }
}
+ */ + public DefaultIntVector( + DataType dataType, int size, Optional nullability, int[] values) + { + super(size, dataType, nullability); + checkArgument(dataType instanceof IntegerType || dataType instanceof DateType); + this.values = requireNonNull(values, "values is null"); + checkArgument(values.length >= 0, "invalid vector size: %s", values.length); + checkArgument(values.length >= size, + "invalid number of values (%s) for given size (%s)", values.length, size); + if (nullability.isPresent()) { + checkArgument(values.length == nullability.get().length, + "vector element components are not of same size" + + "value array size = %s, nullability array size = %s", + values.length, nullability.get().length + ); + } + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * + * @param rowId + * @return + */ + @Override + public int getInt(int rowId) + { + checkValidRowId(rowId); + return values[rowId]; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java new file mode 100644 index 00000000000..2f9394c1902 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java @@ -0,0 +1,69 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package io.delta.kernel.data.vector;

import static io.delta.kernel.DefaultKernelUtils.checkArgument;
import static java.util.Objects.requireNonNull;
import java.util.Optional;

import io.delta.kernel.types.LongType;

/**
 * {@link io.delta.kernel.data.ColumnVector} implementation for long type data,
 * backed by a primitive {@code long[]}.
 */
public class DefaultLongVector
    extends AbstractColumnVector
{
    private final long[] values;

    /**
     * Create a long column vector over the given backing array.
     *
     * @param size number of elements in the vector
     * @param nullability optional per-element null flags; when empty, every element
     *                    is treated as non-null
     * @param values backing values; must contain at least {@code size} entries
     */
    public DefaultLongVector(int size, Optional<boolean[]> nullability, long[] values)
    {
        super(size, LongType.INSTANCE, nullability);
        this.values = requireNonNull(values, "values is null");
        checkArgument(values.length >= 0, "invalid vector size: %s", values.length);
        checkArgument(values.length >= size,
            "invalid number of values (%s) for given size (%s)", values.length, size);
        nullability.ifPresent(nulls ->
            checkArgument(values.length == nulls.length,
                "vector element components are not of same size"
                    + "value array size = %s, nullability array size = %s",
                values.length, nulls.length));
    }

    /**
     * Get the value at the given {@code rowId}. The result is undefined and may be
     * anything when the slot at {@code rowId} is null.
     *
     * @param rowId index of the element to read
     * @return the long stored at {@code rowId}
     */
    @Override
    public long getLong(int rowId)
    {
        checkValidRowId(rowId);
        return values[rowId];
    }
}
+ * All values in the vector are considered non-null when parameter is empty. + * @param offsets Offsets into key and value column vectors on where the index of particular row + * values start and end. + * @param keyVector Vector containing the `key` values from the kv map. + * @param valueVector Vector containing the `value` values from the kv map. + */ + public DefaultMapVector( + int size, + DataType type, + Optional nullability, + int[] offsets, + ColumnVector keyVector, + ColumnVector valueVector) + { + super(size, type, nullability); + checkArgument(offsets.length >= size + 1, "invalid offset array size"); + this.offsets = requireNonNull(offsets, "offsets is null"); + this.keyVector = requireNonNull(keyVector, "keyVector is null"); + this.valueVector = requireNonNull(valueVector, "valueVector is null"); + } + + /** + * Get the value at given {@code rowId}. The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * + * @param rowId + * @return + */ + @Override + public Map getMap(int rowId) + { + checkValidRowId(rowId); + int start = offsets[rowId]; + int end = offsets[rowId + 1]; + + Map values = new HashMap<>(); + for (int entry = start; entry < end; entry++) { + Object key = VectorUtils.getValueAsObject(keyVector, entry); + Object value = VectorUtils.getValueAsObject(valueVector, entry); + values.put((K) key, (V) value); + } + return values; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java new file mode 100644 index 00000000000..4f27879fd86 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java @@ -0,0 +1,70 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.IntegerType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for short type data. + */ +public class DefaultShortVector + extends AbstractColumnVector +{ + private final short[] values; + + /** + * Create an instance of {@link io.delta.kernel.data.ColumnVector} for short type. + * + * @param size number of elements in the vector. + * @param nullability Optional array of nullability value for each element in the vector. + * All values in the vector are considered non-null when parameter is empty. + * @param values column vector values. + */ + public DefaultShortVector(int size, Optional nullability, short[] values) + { + super(size, ByteType.INSTANCE, nullability); + this.values = requireNonNull(values, "values is null"); + checkArgument(values.length >= 0, "invalid vector size: %s", values.length); + checkArgument(values.length >= size, + "invalid number of values (%s) for given size (%s)", values.length, size); + if (nullability.isPresent()) { + checkArgument(values.length == nullability.get().length, + "vector element components are not of same size" + + "value array size = %s, nullability array size = %s", + values.length, nullability.get().length + ); + } + } + + /** + * Get the value at given {@code rowId}. 
The return value is undefined and can be + * anything, if the slot for {@code rowId} is null. + * + * @param rowId + * @return + */ + @Override + public short getShort(int rowId) + { + checkValidRowId(rowId); + return values[rowId]; + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java new file mode 100644 index 00000000000..9bb452ca624 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java @@ -0,0 +1,146 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.data.vector; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import java.util.List; +import java.util.Map; +import static java.util.Objects.requireNonNull; +import java.util.Optional; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.Row; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.StructType; + +/** + * {@link io.delta.kernel.data.ColumnVector} implementation for struct type data. + */ +public class DefaultStructVector + extends AbstractColumnVector +{ + private final ColumnVector[] memberVectors; + private final int size; + + /** + * Create an instance of {@link ColumnVector} for {@code struct} type. + * + * @param size number of elements in the vector. 
+ * @param dataType {@code struct} datatype definition. + * @param nullability Optional array of nullability value for each element in the vector. + * All values in the vector are considered non-null when parameter is empty. + * @param memberVectors column vectors for each member of the struct. + */ + public DefaultStructVector( + int size, + DataType dataType, + Optional nullability, + ColumnVector[] memberVectors) + { + super(size, dataType, nullability); + checkArgument(dataType instanceof StructType, "not a struct type"); + + StructType structType = (StructType) dataType; + checkArgument( + structType.length() == memberVectors.length, + "expected a one column vector for each member"); + this.memberVectors = memberVectors; + this.size = size; + } + + @Override + public Row getStruct(int rowId) + { + checkValidRowId(rowId); + if (isNullAt(rowId)) { + return null; + } + return new StructRow(this, rowId); + } + + /** + * Wrapper class to expose one member as a {@link Row} + */ + private static class StructRow + implements Row + { + private final DefaultStructVector structVector; + private final int rowId; + + StructRow(DefaultStructVector structVector, int rowId) + { + this.structVector = requireNonNull(structVector, "structVector is null"); + checkArgument( + rowId >= 0 && rowId < structVector.getSize(), + "invalid row id: %s", rowId); + this.rowId = rowId; + } + + @Override + public StructType getSchema() + { + return (StructType) structVector.getDataType(); + } + + @Override + public boolean isNullAt(int ordinal) + { + return structVector.memberVectors[ordinal].isNullAt(rowId); + } + + @Override + public boolean getBoolean(int ordinal) + { + return structVector.memberVectors[ordinal].getBoolean(rowId); + } + + @Override + public int getInt(int ordinal) + { + return structVector.memberVectors[ordinal].getInt(rowId); + } + + @Override + public long getLong(int ordinal) + { + return structVector.memberVectors[ordinal].getLong(rowId); + } + + @Override + public 
String getString(int ordinal) + { + return structVector.memberVectors[ordinal].getString(rowId); + } + + @Override + public Row getStruct(int ordinal) + { + return structVector.memberVectors[ordinal].getStruct(rowId); + } + + @Override + public List getArray(int ordinal) + { + return structVector.memberVectors[ordinal].getArray(rowId); + } + + @Override + public Map getMap(int ordinal) + { + return structVector.memberVectors[ordinal].getMap(rowId); + } + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/VectorUtils.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/VectorUtils.java new file mode 100644 index 00000000000..4b348a13fd1 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/VectorUtils.java @@ -0,0 +1,69 @@ +package io.delta.kernel.data.vector; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.types.ArrayType; +import io.delta.kernel.types.BooleanType; +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.DateType; +import io.delta.kernel.types.DoubleType; +import io.delta.kernel.types.FloatType; +import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.LongType; +import io.delta.kernel.types.MapType; +import io.delta.kernel.types.ShortType; +import io.delta.kernel.types.StringType; +import io.delta.kernel.types.StructType; +import io.delta.kernel.types.TimestampType; + +/** + * Utility methods for {@link io.delta.kernel.data.ColumnVector} implementations. + */ +public class VectorUtils +{ + private VectorUtils() {} + + /** + * Get the value at given {@code rowId} from the column vector. The type of the value object + * depends on the data type of the {@code vector}. 
+ * + * @param vector + * @param rowId + * @return + */ + public static Object getValueAsObject(ColumnVector vector, int rowId) { + // TODO: may be it is better to just provide a `getObject` on the `ColumnVector` to + // avoid the nested if-else statements. + final DataType dataType = vector.getDataType(); + + if (vector.isNullAt(rowId)) { + return null; + } + + if (dataType instanceof BooleanType) { + return vector.getBoolean(rowId); + } else if (dataType instanceof ByteType) { + return vector.getByte(rowId); + } else if (dataType instanceof ShortType) { + return vector.getShort(rowId); + } else if (dataType instanceof IntegerType || dataType instanceof DateType) { + return vector.getInt(rowId); + } else if (dataType instanceof LongType || dataType instanceof TimestampType) { + return vector.getLong(rowId); + } else if (dataType instanceof FloatType) { + return vector.getFloat(rowId); + } else if (dataType instanceof DoubleType) { + return vector.getDouble(rowId); + } else if (dataType instanceof StringType) { + return vector.getString(rowId); + } else if (dataType instanceof StructType) { + return vector.getStruct(rowId); + } else if (dataType instanceof MapType) { + return vector.getMap(rowId); + } else if (dataType instanceof ArrayType) { + return vector.getArray(rowId); + } + + throw new UnsupportedOperationException(dataType + " is not supported yet"); + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java new file mode 100644 index 00000000000..64cce0e2760 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java @@ -0,0 +1,181 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
package io.delta.kernel.parquet;

import static io.delta.kernel.parquet.ParquetConverters.initNullabilityVector;
import static io.delta.kernel.parquet.ParquetConverters.setNullabilityToTrue;
import java.util.Arrays;
import java.util.Optional;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.schema.GroupType;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.vector.DefaultArrayVector;
import io.delta.kernel.types.ArrayType;

/**
 * Parquet {@link GroupConverter} that materializes a Parquet LIST group
 * ({@code list -> element}) into a {@link DefaultArrayVector} batch.
 */
class ArrayConverter
    extends GroupConverter
    implements ParquetConverters.BaseConverter
{
    private final ArrayType typeFromClient;
    private final ArrayCollector converter;

    // working state
    private int currentRowIndex;
    private boolean[] nullability;
    private int[] offsets;
    private int collectorIndexAtStart;

    public ArrayConverter(
        int maxBatchSize,
        ArrayType typeFromClient,
        GroupType typeFromFile)
    {
        this.typeFromClient = typeFromClient;
        final GroupType innerElementType = (GroupType) typeFromFile.getType("list");
        this.converter = new ArrayCollector(
            maxBatchSize,
            typeFromClient,
            innerElementType
        );

        // initialize working state
        this.nullability = initNullabilityVector(maxBatchSize);
        this.offsets = new int[maxBatchSize + 1];
    }

    @Override
    public Converter getConverter(int fieldIndex)
    {
        switch (fieldIndex) {
            case 0:
                return converter;
            default:
                // Bug fix: the message previously said "map column"; this converter
                // handles array columns.
                throw new IllegalArgumentException(
                    "Invalid field index for an array column: " + fieldIndex);
        }
    }

    @Override
    public void start()
    {
        // Remember where the element collector stood, so end() can tell whether this
        // row contributed any elements.
        collectorIndexAtStart = converter.currentEntryIndex;
    }

    @Override
    public void end()
    {
        int collectorIndexAtEnd = converter.currentEntryIndex;
        // NOTE(review): a row with zero collected elements is flagged here
        // (collectorIndexAtEnd == collectorIndexAtStart); confirm against the
        // nullability convention in ParquetConverters that this is the intended
        // treatment of empty vs. null arrays.
        this.nullability[currentRowIndex] = collectorIndexAtEnd == collectorIndexAtStart;
        this.offsets[currentRowIndex + 1] = collectorIndexAtEnd;
    }

    /**
     * Build the finished {@link DefaultArrayVector} for the current batch and reset
     * the working state for the next batch.
     */
    @Override
    public ColumnVector getDataColumnVector(int batchSize)
    {
        ColumnVector vector = new DefaultArrayVector(
            batchSize,
            typeFromClient,
            Optional.of(nullability),
            offsets,
            converter.getArrayVector()
        );
        this.currentRowIndex = 0;
        this.nullability = initNullabilityVector(nullability.length);
        this.offsets = new int[offsets.length];

        return vector;
    }

    @Override
    public boolean moveToNextRow()
    {
        currentRowIndex++;
        resizeIfNeeded();

        return nullability[currentRowIndex - 1];
    }

    @Override
    public void resizeIfNeeded()
    {
        // Grow geometrically once the batch working state is full.
        if (nullability.length == currentRowIndex) {
            int newSize = nullability.length * 2;
            this.nullability = Arrays.copyOf(this.nullability, newSize);
            setNullabilityToTrue(this.nullability, newSize / 2, newSize);

            this.offsets = Arrays.copyOf(this.offsets, newSize + 1);
        }
    }

    /**
     * Collector for the repeated inner {@code element} field; accumulates a flat
     * vector of elements that the outer converter slices via {@code offsets}.
     */
    public static class ArrayCollector
        extends GroupConverter
    {
        private final Converter converter;

        // working state
        private int currentEntryIndex;

        public ArrayCollector(int maxBatchSize, ArrayType typeFromClient, GroupType innerArrayType)
        {
            this.converter = ParquetConverters.createConverter(
                maxBatchSize,
                typeFromClient.getElementType(),
                innerArrayType.getType("element"));
        }

        @Override
        public Converter getConverter(int fieldIndex)
        {
            switch (fieldIndex) {
                case 0:
                    return converter;
                default:
                    // Bug fix: the message previously said "map column"; this collector
                    // handles array columns.
                    throw new IllegalArgumentException(
                        "Invalid field index for an array column: " + fieldIndex);
            }
        }

        @Override
        public void start()
        {
            if (!converter.isPrimitive()) {
                converter.asGroupConverter().start();
            }
        }

        @Override
        public void end()
        {
            if (!converter.isPrimitive()) {
                converter.asGroupConverter().end();
            }
            ((ParquetConverters.BaseConverter) converter).moveToNextRow();
            currentEntryIndex++;
        }

        /**
         * Hand over the accumulated element vector and reset for the next batch.
         */
        public ColumnVector getArrayVector()
        {
            ColumnVector vector = ((ParquetConverters.BaseConverter) converter)
                .getDataColumnVector(currentEntryIndex);

            currentEntryIndex = 0;
            return vector;
        }
    }
}
package io.delta.kernel.parquet;

import static io.delta.kernel.parquet.ParquetConverters.initNullabilityVector;
import static io.delta.kernel.parquet.ParquetConverters.setNullabilityToTrue;
import java.util.Arrays;
import java.util.Optional;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.schema.GroupType;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.vector.DefaultMapVector;
import io.delta.kernel.types.MapType;

/**
 * Parquet {@link GroupConverter} for columns of Delta type {@link MapType}.
 * Parquet stores maps as a repeated group named "key_value" containing "key" and "value"
 * fields. The outer converter tracks per-row nullability and entry offsets; the nested
 * {@link MapCollector} accumulates the flattened key and value entries for the batch.
 */
class MapConverter
    extends GroupConverter
    implements ParquetConverters.BaseConverter
{
    private final MapType typeFromClient;
    private final MapCollector converter;

    // working state
    private int currentRowIndex;
    private boolean[] nullability;
    // offsets[i + 1] is the exclusive end offset of row i's entries in the key/value vectors
    private int[] offsets;
    // entry count observed when start() was called for the current row
    private int collectorIndexAtStart;

    public MapConverter(
        int maxBatchSize,
        MapType typeFromClient,
        GroupType typeFromFile)
    {
        this.typeFromClient = typeFromClient;
        final GroupType innerMapType = (GroupType) typeFromFile.getType("key_value");
        this.converter = new MapCollector(
            maxBatchSize,
            typeFromClient,
            innerMapType
        );

        // initialize working state
        this.nullability = initNullabilityVector(maxBatchSize);
        this.offsets = new int[maxBatchSize + 1];
    }

    @Override
    public Converter getConverter(int fieldIndex)
    {
        switch (fieldIndex) {
            case 0:
                return converter;
            default:
                throw new IllegalArgumentException(
                    "Invalid field index for a map column: " + fieldIndex);
        }
    }

    @Override
    public void start()
    {
        // Remember how many entries the collector had before this row's entries arrive.
        collectorIndexAtStart = converter.currentEntryIndex;
    }

    @Override
    public void end()
    {
        // NOTE(review): a row for which no entries were collected is recorded as null, so an
        // empty (non-null) map is indistinguishable from a null map here — confirm against
        // Parquet's converter call semantics for empty maps.
        int collectorIndexAtEnd = converter.currentEntryIndex;
        this.nullability[currentRowIndex] = collectorIndexAtEnd == collectorIndexAtStart;
        this.offsets[currentRowIndex + 1] = collectorIndexAtEnd;
    }

    @Override
    public ColumnVector getDataColumnVector(int batchSize)
    {
        ColumnVector vector = new DefaultMapVector(
            batchSize,
            typeFromClient,
            Optional.of(nullability),
            offsets,
            converter.getKeyVector(),
            converter.getValueVector()
        );
        // re-initialize the working state for the next batch
        this.currentRowIndex = 0;
        this.converter.currentEntryIndex = 0;
        this.nullability = initNullabilityVector(nullability.length);
        this.offsets = new int[offsets.length];

        return vector;
    }

    @Override
    public boolean moveToNextRow()
    {
        currentRowIndex++;
        resizeIfNeeded();

        return nullability[currentRowIndex - 1];
    }

    @Override
    public void resizeIfNeeded()
    {
        // Grow the working buffers when a batch overflows the current capacity.
        if (nullability.length == currentRowIndex) {
            int newSize = nullability.length * 2;
            this.nullability = Arrays.copyOf(this.nullability, newSize);
            setNullabilityToTrue(this.nullability, newSize / 2, newSize);
            this.offsets = Arrays.copyOf(this.offsets, newSize + 1);
        }
    }

    /**
     * Collects the flattened key and value entries of the map column across all rows of a batch.
     */
    public static class MapCollector
        extends GroupConverter
    {
        // converters[0] handles "key", converters[1] handles "value"
        private final Converter[] converters;

        // working state: number of entries collected so far in the current batch
        private int currentEntryIndex;

        public MapCollector(
            int maxBatchSize,
            MapType typeFromClient,
            GroupType innerMapType)
        {
            this.converters = new Converter[2];
            this.converters[0] = ParquetConverters.createConverter(
                maxBatchSize,
                typeFromClient.getKeyType(),
                innerMapType.getType("key"));
            this.converters[1] = ParquetConverters.createConverter(
                maxBatchSize,
                typeFromClient.getValueType(),
                innerMapType.getType("value"));
        }

        @Override
        public Converter getConverter(int fieldIndex)
        {
            switch (fieldIndex) {
                case 0: // fall through
                case 1:
                    return converters[fieldIndex];
                default:
                    throw new IllegalArgumentException(
                        "Invalid field index for a map column: " + fieldIndex);
            }
        }

        @Override
        public void start()
        {
            // Group converters need their own start/end callbacks; primitives do not.
            Arrays.stream(converters)
                .filter(conv -> !conv.isPrimitive())
                .forEach(conv -> ((GroupConverter) conv).start());
        }

        @Override
        public void end()
        {
            Arrays.stream(converters)
                .filter(conv -> !conv.isPrimitive())
                .forEach(conv -> ((GroupConverter) conv).end());

            // Advance both key and value converters so they stay aligned per entry.
            Arrays.stream(converters)
                .map(converter -> (ParquetConverters.BaseConverter) converter)
                .forEach(converter -> converter.moveToNextRow());

            currentEntryIndex++;
        }

        /** Return the key values collected so far in this batch. */
        public ColumnVector getKeyVector()
        {
            return ((ParquetConverters.BaseConverter) converters[0])
                .getDataColumnVector(currentEntryIndex);
        }

        /** Return the map values collected so far in this batch. */
        public ColumnVector getValueVector()
        {
            return ((ParquetConverters.BaseConverter) converters[1])
                .getDataColumnVector(currentEntryIndex);
        }
    }
}
+ */ +package io.delta.kernel.parquet; + +import java.io.IOException; +import java.util.Map; +import static java.util.Objects.requireNonNull; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.Reporter; +import org.apache.parquet.hadoop.ParquetRecordReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.io.api.GroupConverter; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.schema.MessageType; + +import io.delta.kernel.DefaultKernelUtils; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; + +public class ParquetBatchReader +{ + private final Configuration configuration; + private final int maxBatchSize; + + public ParquetBatchReader(Configuration configuration) + { + this.configuration = requireNonNull(configuration, "configuration is null"); + this.maxBatchSize = + configuration.getInt("delta.kernel.default.parquet.reader.batch-size", 1024); + } + + public CloseableIterator read(String path, StructType schema) + { + BatchReadSupport batchReadSupport = new BatchReadSupport(maxBatchSize, schema); + ParquetRecordReader reader = new ParquetRecordReader<>(batchReadSupport); + + Path filePath = new Path(path); + try { + FileSystem fs = filePath.getFileSystem(configuration); + FileStatus fileStatus = fs.getFileStatus(filePath); + reader.initialize( + new FileSplit(filePath, 0, fileStatus.getLen(), new String[0]), + configuration, + Reporter.NULL + ); + } + catch (IOException | InterruptedException e) { + throw new RuntimeException(e); + } + + return new CloseableIterator() + { + @Override + public void close() + throws IOException + { + reader.close(); + } + + @Override + public boolean 
hasNext() + { + try { + return reader.nextKeyValue(); + } + catch (IOException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + @Override + public ColumnarBatch next() + { + int batchSize = 0; + do { + // hasNext reads to row to confirm there is a next element. + batchReadSupport.moveToNextRow(); + batchSize++; + } + while (batchSize < maxBatchSize && hasNext()); + + return batchReadSupport.getDataAsColumnarBatch(batchSize); + } + }; + } + + /** + * Implement a {@link ReadSupport} that will collect the data for each row and return + * as a {@link ColumnarBatch}. + */ + public static class BatchReadSupport + extends ReadSupport + { + private final int maxBatchSize; + private final StructType readSchema; + private RowRecordCollector rowRecordCollector; + + public BatchReadSupport(int maxBatchSize, StructType readSchema) + { + this.maxBatchSize = maxBatchSize; + this.readSchema = requireNonNull(readSchema, "readSchema is not null"); + } + + @Override + public ReadContext init(InitContext context) + { + return new ReadContext( + DefaultKernelUtils.pruneSchema(context.getFileSchema(), readSchema)); + } + + @Override + public RecordMaterializer prepareForRead( + Configuration configuration, + Map keyValueMetaData, + MessageType fileSchema, + ReadContext readContext) + { + rowRecordCollector = new RowRecordCollector(maxBatchSize, readSchema, fileSchema); + return rowRecordCollector; + } + + public ColumnarBatch getDataAsColumnarBatch(int batchSize) + { + return rowRecordCollector.getDataAsColumnarBatch(batchSize); + } + + public void moveToNextRow() + { + rowRecordCollector.moveToNextRow(); + } + } + + /** + * Collects the records given by the Parquet reader as columnar data. Parquet reader allows + * reading data row by row, but {@link ParquetBatchReader} wants to expose the data as a + * columnar batch. Parquet reader takes an implementation of {@link RecordMaterializer} + * to which it gives data for each column one row a time. 
This {@link RecordMaterializer} + * implementation collects the column values for multiple rows and returns a + * {@link ColumnarBatch} at the end. + */ + public static class RowRecordCollector + extends RecordMaterializer + { + private static final Object FAKE_ROW_RECORD = new Object(); + private final RowConverter rowRecordGroupConverter; + + public RowRecordCollector(int maxBatchSize, StructType readSchema, MessageType fileSchema) + { + this.rowRecordGroupConverter = + new RowConverter(maxBatchSize, readSchema, fileSchema); + } + + @Override + public void skipCurrentRecord() + { + super.skipCurrentRecord(); + } + + /** + * Return a fake object. This is not used by {@link ParquetBatchReader}, instead + * {@link #getDataAsColumnarBatch}} once a sufficient number of rows are collected. + */ + @Override + public Object getCurrentRecord() + { + return FAKE_ROW_RECORD; + } + + @Override + public GroupConverter getRootConverter() + { + return rowRecordGroupConverter; + } + + /** + * Return the data collected so far as a {@link ColumnarBatch}. + */ + public ColumnarBatch getDataAsColumnarBatch(int batchSize) + { + return rowRecordGroupConverter.getDataAsColumnarBatch(batchSize); + } + + public void moveToNextRow() + { + rowRecordGroupConverter.moveToNextRow(); + } + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java new file mode 100644 index 00000000000..9d8bb713826 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java @@ -0,0 +1,553 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.parquet; + +import static io.delta.kernel.DefaultKernelUtils.checkArgument; +import java.util.Arrays; +import java.util.Objects; +import java.util.Optional; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.Converter; +import org.apache.parquet.io.api.PrimitiveConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.Type; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.vector.DefaultBinaryVector; +import io.delta.kernel.data.vector.DefaultBooleanVector; +import io.delta.kernel.data.vector.DefaultByteVector; +import io.delta.kernel.data.vector.DefaultConstantVector; +import io.delta.kernel.data.vector.DefaultDoubleVector; +import io.delta.kernel.data.vector.DefaultFloatVector; +import io.delta.kernel.data.vector.DefaultIntVector; +import io.delta.kernel.data.vector.DefaultLongVector; +import io.delta.kernel.data.vector.DefaultShortVector; +import io.delta.kernel.types.ArrayType; +import io.delta.kernel.types.BinaryType; +import io.delta.kernel.types.BooleanType; +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.DateType; +import io.delta.kernel.types.DoubleType; +import io.delta.kernel.types.FloatType; +import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.LongType; +import io.delta.kernel.types.MapType; +import io.delta.kernel.types.ShortType; +import io.delta.kernel.types.StringType; +import io.delta.kernel.types.StructType; + +class ParquetConverters +{ + 
public static Converter createConverter( + int maxBatchSize, + DataType typeFromClient, + Type typeFromFile + ) + { + if (typeFromClient instanceof StructType) { + return new RowConverter( + maxBatchSize, + (StructType) typeFromClient, + (GroupType) typeFromFile); + } + else if (typeFromClient instanceof ArrayType) { + return new ArrayConverter( + maxBatchSize, + (ArrayType) typeFromClient, + (GroupType) typeFromFile + ); + } + else if (typeFromClient instanceof MapType) { + return new MapConverter( + maxBatchSize, + (MapType) typeFromClient, + (GroupType) typeFromFile); + } + else if (typeFromClient instanceof StringType || typeFromClient instanceof BinaryType) { + return new BinaryColumnConverter(typeFromClient, maxBatchSize); + } + else if (typeFromClient instanceof BooleanType) { + return new BooleanColumnConverter(maxBatchSize); + } + else if (typeFromClient instanceof IntegerType || typeFromClient instanceof DateType) { + return new IntColumnConverter(typeFromClient, maxBatchSize); + } + else if (typeFromClient instanceof ByteType) { + return new ByteColumnConverter(maxBatchSize); + } + else if (typeFromClient instanceof ShortType) { + return new ShortColumnConverter(maxBatchSize); + } + else if (typeFromClient instanceof LongType) { + return new LongColumnConverter(maxBatchSize); + } + else if (typeFromClient instanceof FloatType) { + return new FloatColumnConverter(maxBatchSize); + } + else if (typeFromClient instanceof DoubleType) { + return new DoubleColumnConverter(maxBatchSize); + } +// else if (typeFromClient instanceof DecimalType) { +// +// } +// else if (typeFromClient instanceof TimestampType) { +// +// } + + throw new UnsupportedOperationException(typeFromClient + " is not supported"); + } + + public interface BaseConverter + { + ColumnVector getDataColumnVector(int batchSize); + + /** + * Move the converter to accept the next row value. 
+ * + * @return True if the last converted value is null, false otherwise + */ + boolean moveToNextRow(); + + void resizeIfNeeded(); + } + + public static class NonExistentColumnConverter + extends PrimitiveConverter + implements BaseConverter + { + private final DataType dataType; + + public NonExistentColumnConverter(DataType dataType) + { + this.dataType = Objects.requireNonNull(dataType, "dataType is null"); + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + return new DefaultConstantVector(dataType, batchSize, null); + } + + @Override + public boolean moveToNextRow() + { + return true; + } + + @Override + public void resizeIfNeeded() + { + // nothing to resize + } + } + + public abstract static class BasePrimitiveColumnConverter + extends PrimitiveConverter + implements BaseConverter + { + // working state + protected int currentRowIndex; + protected boolean[] nullability; + + BasePrimitiveColumnConverter(int maxBatchSize) + { + checkArgument(maxBatchSize >= 0, "invalid maxBatchSize: %s", maxBatchSize); + + // Initialize the working state + this.nullability = initNullabilityVector(maxBatchSize); + } + + @Override + public boolean moveToNextRow() + { + resizeIfNeeded(); + currentRowIndex++; + return this.nullability[currentRowIndex - 1]; + } + } + + public static class BooleanColumnConverter + extends BasePrimitiveColumnConverter + { + // working state + private boolean[] values; + + BooleanColumnConverter(int maxBatchSize) + { + super(maxBatchSize); + this.values = new boolean[maxBatchSize]; + } + + @Override + public void addBoolean(boolean value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultBooleanVector(batchSize, Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new 
boolean[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class ByteColumnConverter + extends BasePrimitiveColumnConverter + { + + // working state + private byte[] values; + + ByteColumnConverter(int maxBatchSize) + { + super(maxBatchSize); + this.values = new byte[maxBatchSize]; + } + + @Override + public void addInt(int value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = (byte) value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultByteVector(batchSize, Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new byte[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class ShortColumnConverter + extends BasePrimitiveColumnConverter + { + + // working state + private short[] values; + + ShortColumnConverter(int maxBatchSize) + { + super(maxBatchSize); + this.values = new short[maxBatchSize]; + } + + @Override + public void addInt(int value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = (short) value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultShortVector(batchSize, 
Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new short[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class IntColumnConverter + extends BasePrimitiveColumnConverter + { + private final DataType dataType; + // working state + private int[] values; + + IntColumnConverter(DataType dataType, int maxBatchSize) + { + super(maxBatchSize); + checkArgument(dataType instanceof IntegerType || dataType instanceof DataType); + this.dataType = dataType; + this.values = new int[maxBatchSize]; + } + + @Override + public void addInt(int value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultIntVector(dataType, batchSize, Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new int[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class LongColumnConverter + extends BasePrimitiveColumnConverter + { + + // working state + private long[] values; + + LongColumnConverter(int maxBatchSize) + { + super(maxBatchSize); + this.values = new long[maxBatchSize]; + } + + @Override + public void 
addLong(long value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultLongVector(batchSize, Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new long[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class FloatColumnConverter + extends BasePrimitiveColumnConverter + { + // working state + private float[] values; + + FloatColumnConverter(int maxBatchSize) + { + super(maxBatchSize); + this.values = new float[maxBatchSize]; + } + + @Override + public void addFloat(float value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultFloatVector(batchSize, Optional.of(nullability), values); + this.nullability = initNullabilityVector(nullability.length); + this.values = new float[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class DoubleColumnConverter + extends BasePrimitiveColumnConverter + { + + // working state + private double[] values; + + DoubleColumnConverter(int 
maxBatchSize) + { + super(maxBatchSize); + this.values = new double[maxBatchSize]; + } + + @Override + public void addDouble(double value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value; + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = + new DefaultDoubleVector(batchSize, Optional.of(nullability), values); + // re-initialize the working space + this.nullability = initNullabilityVector(nullability.length); + this.values = new double[values.length]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + public static class BinaryColumnConverter + extends BasePrimitiveColumnConverter + { + private final DataType dataType; + + // working state + private byte[][] values; + + BinaryColumnConverter(DataType dataType, int maxBatchSize) + { + super(maxBatchSize); + this.dataType = dataType; + this.values = new byte[maxBatchSize][]; + } + + @Override + public void addBinary(Binary value) + { + resizeIfNeeded(); + this.nullability[currentRowIndex] = false; + this.values[currentRowIndex] = value.getBytes(); + } + + @Override + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector vector = new DefaultBinaryVector(dataType, batchSize, values); + // re-initialize the working space + this.values = new byte[values.length][]; + this.currentRowIndex = 0; + return vector; + } + + @Override + public void resizeIfNeeded() + { + if (values.length == currentRowIndex) { + int newSize = values.length * 2; + this.values = Arrays.copyOf(this.values, newSize); + this.nullability = Arrays.copyOf(this.nullability, newSize); + 
setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + } + + static boolean[] initNullabilityVector(int size) + { + boolean[] nullability = new boolean[size]; + // Initialize all values as null. As Parquet calls this converter only for non-null + // values, make the corresponding value to false. + Arrays.fill(nullability, true); + + return nullability; + } + + static void setNullabilityToTrue(boolean[] nullability, int start, int end) + { + // Initialize all values as null. As Parquet calls this converter only for non-null + // values, make the corresponding value to false. + Arrays.fill(nullability, start, end, true); + } +} diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java new file mode 100644 index 00000000000..f612d7b7350 --- /dev/null +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java @@ -0,0 +1,153 @@ +package io.delta.kernel.parquet; + +import static io.delta.kernel.DefaultKernelUtils.findFieldType; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import static java.util.Objects.requireNonNull; +import java.util.Optional; +import org.apache.parquet.io.api.Converter; +import org.apache.parquet.io.api.GroupConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.Type; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.DefaultColumnarBatch; +import io.delta.kernel.data.vector.DefaultStructVector; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.StructField; +import io.delta.kernel.types.StructType; + +class RowConverter + extends GroupConverter + implements ParquetConverters.BaseConverter +{ + private final StructType readSchema; + private final Converter[] converters; + // The delta may request columns that don't exists in 
Parquet + // This map is to track the ordinal known to Parquet reader to the converter array ordinal. + // If a column is missing, a dummy converter added to the `converters` array and which + // generates all null vector at the end. + private final Map parquetOrdinalToConverterOrdinal; + + // Working state + private int currentRowIndex; + private boolean[] nullability; + + RowConverter( + int maxBatchSize, + StructType readSchema, + GroupType fileSchema) + { + this.readSchema = requireNonNull(readSchema, "readSchema is not null"); + List fields = readSchema.fields(); + this.converters = new Converter[fields.size()]; + this.parquetOrdinalToConverterOrdinal = new HashMap<>(); + + // Initialize the working state + this.nullability = ParquetConverters.initNullabilityVector(maxBatchSize); + + int parquetOrdinal = 0; + for (int i = 0; i < converters.length; i++) { + final StructField field = fields.get(i); + final DataType typeFromClient = field.getDataType(); + final Type typeFromFile = findFieldType(fileSchema, field); + if (typeFromFile == null) { + converters[i] = new ParquetConverters.NonExistentColumnConverter(typeFromClient); + } + else { + converters[i] = + ParquetConverters.createConverter(maxBatchSize, typeFromClient, typeFromFile); + parquetOrdinalToConverterOrdinal.put(parquetOrdinal, i); + parquetOrdinal++; + } + } + } + + @Override + public Converter getConverter(int fieldIndex) + { + return converters[fieldIndex]; + } + + @Override + public void start() + { + Arrays.stream(converters) + .filter(conv -> !conv.isPrimitive()) + .forEach(conv -> ((GroupConverter) conv).start()); + } + + @Override + public void end() + { + Arrays.stream(converters) + .filter(conv -> !conv.isPrimitive()) + .forEach(conv -> ((GroupConverter) conv).end()); + } + + public ColumnarBatch getDataAsColumnarBatch(int batchSize) + { + ColumnVector[] memberVectors = collectMemberVectors(batchSize); + ColumnarBatch batch = new DefaultColumnarBatch(batchSize, readSchema, memberVectors); + 
resetWorkingState(); + return batch; + } + + @Override + public boolean moveToNextRow() + { + resizeIfNeeded(); + long memberNullCount = Arrays.stream(converters) + .map(converter -> (ParquetConverters.BaseConverter) converter) + .map(converters -> converters.moveToNextRow()) + .filter(result -> result) + .count(); + + boolean isNull = memberNullCount == converters.length; + nullability[currentRowIndex] = isNull; + + currentRowIndex++; + + return isNull; + } + + public ColumnVector getDataColumnVector(int batchSize) + { + ColumnVector[] memberVectors = collectMemberVectors(batchSize); + ColumnVector vector = new DefaultStructVector( + batchSize, + readSchema, + Optional.of(nullability), + memberVectors + ); + resetWorkingState(); + return vector; + } + + private ColumnVector[] collectMemberVectors(int batchSize) + { + return Arrays.stream(converters) + .map(converter -> ((ParquetConverters.BaseConverter) converter).getDataColumnVector( + batchSize)) + .toArray(ColumnVector[]::new); + } + + @Override + public void resizeIfNeeded() + { + if (nullability.length == currentRowIndex) { + int newSize = nullability.length * 2; + this.nullability = Arrays.copyOf(this.nullability, newSize); + ParquetConverters.setNullabilityToTrue(this.nullability, newSize / 2, newSize); + } + } + + private void resetWorkingState() + { + this.currentRowIndex = 0; + this.nullability = ParquetConverters.initNullabilityVector(this.nullability.length); + } +} diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java new file mode 100644 index 00000000000..3d60e93d811 --- /dev/null +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java @@ -0,0 +1,403 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.parquet; + +import java.sql.Date; +import java.time.LocalDate; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.hadoop.conf.Configuration; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import org.junit.Test; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.Row; +import io.delta.kernel.types.ArrayType; +import io.delta.kernel.types.BinaryType; +import io.delta.kernel.types.BooleanType; +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.DateType; +import io.delta.kernel.types.DoubleType; +import io.delta.kernel.types.FloatType; +import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.LongType; +import io.delta.kernel.types.MapType; +import io.delta.kernel.types.ShortType; +import io.delta.kernel.types.StringType; +import io.delta.kernel.types.StructField; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.DefaultKernelTestUtils; +import io.delta.kernel.utils.Tuple2; + +public class 
TestParquetBatchReader +{ + /** + * Test reads data from a Parquet file with data of various combinations of data types supported + * byt Delta Lake table protocol. Code for generating the golden parquet files is located: + * https://gist.github.com/vkorukanti/238bad726545e466202278966989f02b (TODO: Move this a better + * place). + */ + private static final String ALL_TYPES_FILE = + DefaultKernelTestUtils.getTestResourceFilePath("parquet/all_types.parquet"); + + private static final StructType ALL_TYPES_FILE_SCHEMA = new StructType() + .add("byteType", ByteType.INSTANCE) + .add("shortType", ShortType.INSTANCE) + .add("integerType", IntegerType.INSTANCE) + .add("longType", LongType.INSTANCE) + .add("floatType", FloatType.INSTANCE) + .add("doubleType", DoubleType.INSTANCE) + // .add("decimal", new DecimalType(10, 2)) // TODO + .add("booleanType", BooleanType.INSTANCE) + .add("stringType", StringType.INSTANCE) + .add("binaryType", BinaryType.INSTANCE) + .add("dateType", DateType.INSTANCE) + // .add("timestampType", TimestampType.INSTANCE) // TODO + .add("nested_struct", + new StructType() + .add("aa", StringType.INSTANCE) + .add("ac", new StructType().add("aca", IntegerType.INSTANCE)) + ).add("array_of_prims", + new ArrayType(IntegerType.INSTANCE, true) + ).add("array_of_structs", + new ArrayType(new StructType().add("ab", LongType.INSTANCE), true) + ).add("map_of_prims", new MapType(IntegerType.INSTANCE, LongType.INSTANCE, true)) + .add("map_of_complex", new MapType( + IntegerType.INSTANCE, + new StructType().add("ab", LongType.INSTANCE), + true + )); + + private static final LocalDate EPOCH = LocalDate.ofEpochDay(0); + + @Test + public void readAllTypesOfData() + throws Exception + { + readAndVerify(ALL_TYPES_FILE_SCHEMA, 90 /* readBatchSize */); + } + + @Test + public void readSubsetOfColumns() + throws Exception + { + StructType readSchema = new StructType() + .add("byteType", ByteType.INSTANCE) + .add("booleanType", BooleanType.INSTANCE) + .add("stringType", 
StringType.INSTANCE) + .add("dateType", DateType.INSTANCE) + .add("nested_struct", + new StructType() + .add("aa", StringType.INSTANCE) + .add("ac", new StructType().add("aca", IntegerType.INSTANCE)) + ).add("array_of_prims", + new ArrayType(IntegerType.INSTANCE, true) + ); + + readAndVerify(readSchema, 73 /* readBatchSize */); + } + + @Test + public void readSubsetOfColumnsWithMissingColumnsInFile() + throws Exception + { + StructType readSchema = new StructType() + .add("booleanType", BooleanType.INSTANCE) + .add("integerType", IntegerType.INSTANCE) + .add("missing_column_primitive", DateType.INSTANCE) + .add("missing_column_struct", + new StructType().add("ab", IntegerType.INSTANCE)); + + readAndVerify(readSchema, 23 /* readBatchSize */); + } + + private static Configuration newConf(Optional batchSize) + { + Configuration conf = new Configuration(); + if (batchSize.isPresent()) { + conf.set("delta.kernel.default.parquet.reader.batch-size", batchSize.get().toString()); + } + return conf; + } + + private static void readAndVerify(StructType readSchema, int readBatchSize) + throws Exception + { + ParquetBatchReader batchReader = + new ParquetBatchReader(newConf(Optional.of(readBatchSize))); + List batches = + readAsBatches(batchReader, ALL_TYPES_FILE, readSchema); + + for (int rowId = 0; rowId < 200; rowId++) { + verifyRowFromAllTypesFile(readSchema, batches, rowId); + } + } + + private static List readAsBatches( + ParquetBatchReader parquetReader, + String path, + StructType readSchema) throws Exception + { + List batches = new ArrayList<>(); + try (CloseableIterator dataIter = parquetReader.read(path, readSchema)) { + while (dataIter.hasNext()) { + batches.add(dataIter.next()); + } + } + return batches; + } + + private static void verifyRowFromAllTypesFile( + StructType readSchema, + List batches, + int rowId) + { + Tuple2 batchWithIdx = getBatchForRowId(batches, rowId); + int ordinal = 0; + for (StructField structField : readSchema.fields()) { + String name = 
structField.getName().toLowerCase(); + ColumnVector vector = batchWithIdx._1.getColumnVector(ordinal); + switch (name) { + case "booleantype": { + Boolean expValue = (rowId % 87 != 0) ? rowId % 2 == 0 : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.booleanValue(), vector.getBoolean(batchWithIdx._2)); + } + break; + } + case "bytetype": { + Byte expValue = (rowId % 72 != 0) ? (byte) rowId : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.byteValue(), vector.getByte(batchWithIdx._2)); + } + break; + } + case "shorttype": { + Short expValue = (rowId % 56 != 0) ? (short) rowId : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.shortValue(), vector.getShort(batchWithIdx._2)); + } + break; + } + case "datetype": { + LocalDate expValue = (rowId % 61 != 0) ? + new Date(rowId * 20000000L).toLocalDate() : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + long numDaysSinceEpoch = ChronoUnit.DAYS.between(EPOCH, expValue); + assertEquals(numDaysSinceEpoch, vector.getInt(batchWithIdx._2)); + } + break; + } + case "integertype": { + Integer expValue = (rowId % 23 != 0) ? rowId : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.intValue(), vector.getInt(batchWithIdx._2)); + } + break; + } + case "longtype": { + Long expValue = (rowId % 25 != 0) ? rowId + 1L : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.longValue(), vector.getLong(batchWithIdx._2)); + } + break; + } + case "floattype": { + Float expValue = (rowId % 28 != 0) ? 
(rowId * 0.234f) : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.floatValue(), vector.getFloat(batchWithIdx._2), 0.02); + } + break; + } + case "doubletype": { + Double expValue = (rowId % 54 != 0) ? (rowId * 234234.23d) : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertEquals(expValue.doubleValue(), vector.getDouble(batchWithIdx._2), + 0.02); + } + break; + } + case "stringtype": { + String expValue = (rowId % 57 != 0) ? Integer.toString(rowId) : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertArrayEquals(expValue.getBytes(), vector.getBinary(batchWithIdx._2)); + } + break; + } + case "binarytype": { + byte[] expValue = (rowId % 59 != 0) ? Integer.toString(rowId).getBytes() : null; + if (expValue == null) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + assertArrayEquals(expValue, vector.getBinary(batchWithIdx._2)); + } + break; + } + case "timestamptype": { + throw new UnsupportedOperationException("not yet implemented: " + name); + } + case "decimal": { + throw new UnsupportedOperationException("not yet implemented: " + name); + } + case "nested_struct": { + Row struct = vector.getStruct(batchWithIdx._2); + assertFalse(vector.isNullAt(batchWithIdx._2)); + String aaVal = struct.getString(0); + assertEquals(Integer.toString(rowId), aaVal); + + boolean expAcValNull = rowId % 23 == 0; + Row acVal = struct.getStruct(1); + if (expAcValNull) { + assertTrue(struct.isNullAt(1)); + assertNull(acVal); + } + else { + int actAcaVal = acVal.getInt(0); + assertEquals(rowId, actAcaVal); + } + break; + } + case "array_of_prims": { + boolean expIsNull = rowId % 25 == 0; + if (expIsNull) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + List expArray = Arrays.asList(rowId, null, rowId + 1); + List actArray = vector.getArray(batchWithIdx._2); + assertEquals(expArray, actArray); 
+ } + break; + } + case "array_of_structs": { + assertFalse(vector.isNullAt(batchWithIdx._2)); + List actArray = vector.getArray(batchWithIdx._2); + assertTrue(actArray.size() == 2); + Row item0 = actArray.get(0); + assertEquals(rowId, item0.getLong(0)); + assertNull(actArray.get(1)); + break; + } + case "map_of_prims": { + boolean expIsNull = rowId % 28 == 0; + if (expIsNull) { + assertTrue(vector.isNullAt(batchWithIdx._2)); + } + else { + Map actValue = vector.getMap(batchWithIdx._2); + assertTrue(actValue.size() == 2); + + // entry 0: key = rowId + Integer key0 = rowId; + Long actValue0 = actValue.get(key0); + Long expValue0 = (rowId % 29 == 0) ? null : (rowId + 2L); + assertEquals(expValue0, actValue0); + + // entry 1: key = if (rowId % 27 != 0) rowId + 2 else null + // TODO: Not sure if this is a bug or expected behavior. In Delta-Spark, + // whenever the map key value is null - it is stored as 0. Not sure + // what happens for non-integer keys. + // Integer key1 = (rowId % 27 == 0) ? null : rowId + 2; + Integer key1 = (rowId % 27 == 0) ? 
0 : rowId + 2; + Long actValue1 = actValue.get(key1); + Long expValue1 = rowId + 9L; + assertEquals(expValue1, actValue1); + } + break; + } + case "map_of_complex": { + // Map(i + 1 -> (if (i % 10 == 0) Row((i*20).longValue()) else null)) + assertFalse(vector.isNullAt(batchWithIdx._2)); + Map actValue = vector.getMap(batchWithIdx._2); + + // entry 0: key = rowId + Integer key0 = rowId + 1; + boolean expValue0IsNull = rowId % 10 != 0; + Row actValue0 = actValue.get(key0); + if (expValue0IsNull) { + assertNull(actValue0); + } + else { + Long actValue0Member = actValue0.getLong(0); + Long expValue0Member = rowId * 20L; + assertEquals(expValue0Member, actValue0Member); + } + break; + } + case "missing_column_primitive": + case "missing_column_struct": { + assertTrue(vector.isNullAt(batchWithIdx._2)); + break; + } + default: + throw new IllegalArgumentException("unknown column: " + name); + } + ordinal++; + } + } + + private static Tuple2 getBatchForRowId( + List batches, int rowId) + { + int indexStart = 0; + for (ColumnarBatch batch : batches) { + if (indexStart <= rowId && rowId < indexStart + batch.getSize()) { + return new Tuple2<>(batch, rowId - indexStart); + } + indexStart += batch.getSize(); + } + + throw new IllegalArgumentException("row id is not found: " + rowId); + } +} diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/utils/DefaultKernelTestUtils.java b/kernel/kernel-default/src/test/java/io/delta/kernel/utils/DefaultKernelTestUtils.java new file mode 100644 index 00000000000..1af8bfeaca1 --- /dev/null +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/utils/DefaultKernelTestUtils.java @@ -0,0 +1,25 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.utils; + +public class DefaultKernelTestUtils +{ + private DefaultKernelTestUtils() {} + + public static String getTestResourceFilePath(String resourcePath) { + return DefaultKernelTestUtils.class.getClassLoader().getResource(resourcePath).getFile(); + } +} diff --git a/kernel/kernel-default/src/test/resources/parquet/all_types.parquet b/kernel/kernel-default/src/test/resources/parquet/all_types.parquet new file mode 100644 index 0000000000000000000000000000000000000000..460a69b66ec7f18d9e80205582c5e6e1abe0626e GIT binary patch literal 24532 zcmeI42Urx>`u}H^1$J>4QJHm7)CB<%0qI3ic2-5~sMtm6iqZrusOW--SV8O^5m6Bv zHtZ|bs7Z{;jmh<=F)`P~l&eWhH~Ra&duT?JkoJG>ef~clKl9yv=gfP~%z4k5cgh+j z4)GK?;hI9Yq7dFIn!iVCD-@c*VkXEquFOnh$#I;?Q$5csP1TV+k8cuQwG$!*roarC z0~F`vzydS@3ZMjh$m3g*rubk5nt|q^1+WGJP=S^}4QxOw&>Gl+HlQtN2Q2;0=6$FYp8YAOHk{E}$z20^LA&5Dc_H1Uk?I z^aQ;?2ffnX2_1B1a3FcgG?VPH5I0Y-vRU^Ey5#)1ei4vYtpAPPi- z7!V8MKs=ZLCV~Vo2}}lwAPFRc6p#w0fHaT}GQd$ zpcrfgC14vU1=~Rx*a3EeU7#H71{Gir*bDZ7{onwo1P8$(a2Ol`M?n=h29ARh;3PN& zPJ=VR0M3GHa1NXY7r;eu31F~XRtN@#aQ#yCzf2kqpHeeh3VM^Lk-Uvm?NKv&)R$pv z?C7E4M}vnCd;GfOKY8^0nUNEU9sExoJ%4KCVBfMq4wwgW!F-Sh7Jz)P5G(?V!4j|( zECU6g5G)5pU;@HJ z57-O#f&JhBs00VWA#fNR0Y^aE+w&kZuIki>I&~bhIr37XvFa>779LRtiSb!!#0hFM2@X#RqrLm*N zIvdamv<9}I4QLD60S&MN_MknWW;p^U&;dAuj=%+U00h#~>Py$QP6j*^~ zpgCv(tbqVjpe0ZP8_*iqf;ONnXs49fDKI+icvBj6_DtF{abV)e#ED4pB|i4PNBCVou(nFKHiWYUF6S0+JBx-sd_B$$boiO599qz99p zOnNa1VbYsPD3d-+`ZDRqWB`+aOa?ItV=|b@5GF&Jgfkh&WH^%%Ohz&p#bh*-F-*oX ziC{90$#^D_Orn@XGl^jm%Osx31SS)iBruu8WHOUPCP_?^nWQjDWio|H8k2M;8BC@! 
znZ{%~lNn4hnapG|i^*&zSxn|K(KE?rlEY*klUyeAndC89z$Bl^LMDrtEM~HV$xlTJElQATUG zU#EStYq0j_wn#0?YnQH>qdjyvPg~M!8Op8FW(99T*;4JuzzUQp;^Ut* zBI+04pXn@)40IJ=|F*M;wumP){6%l?u44HY-9@xZT$>ysTDtWW=YKjtOrJPReBd-n zjQrPF5p5L*M#qX5G!wPct{vqz|eG~iluIOlX3;Xy#;>PoL zu%CCu=JP(lzJ4e!6z^kyKNdOe6YTS+;`Gy>W4|AY|DOE?_Wdg{w(Hl}|8K=x-+zZ= z@Pjy{>Isg+Q`kA3q!X_zZY9y(`rbsB`9P{``L3xB{i7RFWv+X)L#A80My^9&>0X)E zL>C{g&`F0Yb?7%;uy0e{Z3ipeEbC@E^r7y^SIu?d_gd(_ykV_Ff9l*SRl2KNTk0k* zSL@KXy3*8EI$dOI-Ft&H{A-gyUtVAS%>)0C4JKoBHLyDRU)Umig zbqM08bhEhm#u|kvmuYN8VQ=rK+fy8sRvK4vLVUc_y*r*twZ>OG_qFrJo64nKq`!!(1@HEj=XJvT%-QZp1?S8${4(r{&@W*EAbd$eKraOFhJ z2y}g_(nd2a|9ClGd0jg}{9vZ_#bbLW zh_7C|7CxhBqVgS0f_SX${Cj@W6O?!Hxiqy~+Sg+yDL>{W@dgEVNIMyVHj@;jIVM7I z*h$)YS|qt~nj|_PdmjEf2__t`WZq!ab?=+WVn-`31*R*BRVn;PR~-3N47hC-s|Kd> zk+g@YRQ|Ccm7l1X1SGeQHm@RI$%pZYo+JRqmeZrZ;}yxk#o%5H%$Bi!w-XR7nyZ+@Fp;I}kc*1_Xzgf;4rigs=sc3JcFuDk=tGjVbJmlfD@oqFt(XfzjFLEY zn+tJ8*UqJQN#6LyTzG_e0Ic}5ns1cp5N1xkAl)op8_Ioh5EmF8Id+li;D09NVEl>snjD%I?#XxLp#KpqIcQ2=;9U{~ z4T(#F%{&NVhvc=jLr4%pBx#oylOU2vgzwIiAfQN&b$CjGahD5;7~xNX2qWPZq>>=e zNJ@|FB0;?6qJ^RNKgh)qz!)X@RIr^7LHv;v^cqHjS&YPc>M|029q&|jFnjEKfoG8bBU?5l@86bH9H%n}E2*navc_#c&}+`1*|>k{Zqxoob$+ZKmgE zG6%j~-~%QNtNYttUT%Rc7g-B$DTJz<>z11}+<3l4O?DPdj}Kgo_dq>)Q_c%`10UcE z`~YrtZ~?rjG*H63aUP~-oG0)C-oOX=0zcpn0!+iRE! 
zrp@cup(R1Tf*;oTp{2h@8rBCH@rO2s0F(pL05ARqefUKJvF~e1RYE2LW<(c?e4UoM70V#|z&ZAocPwO@JZf1-xNv3_d8=7<^Ig7m)h;n|@Tc z(hmh2gFnjoV{d+q0IUy?&j%sDN+8z%0zrZPCK!93fdNbc*%-R65b}2v?z4R97>cJc z)b{_6F~r8#ua6;&FS(y-TBAk}jj;cq5%l7SYA||eB-M_fhQn&UaWpk%^iY|`j2aJdb}w z55%9*iH-*nKqC&T{<7z9E#d3$dIpDS;T?%^Ln17DczuP$I4wxoi!Bv=&8c+~riDlA zn)7DfS}SR`Mwa8DIEPPzt$Pht=SA6GngWwUpCX4vCC*nXT@~j%`oHP3^j^U0Uj;Rl ziec8hmSd&QfH!*%wx*N%sMT>1H>XBnOMF^imT>DxQtL}oVA}jq=ACb|+R1Vv)p=W; z7nKAoyBAvIfI_2C3M)%sOITl4US9s`@x88BE0b;=+50IyRu!beT?e7Ejc|D5H;2uG zgnSd~nElOr8kU_uGC;sHux^f=zRkj<4K8wpwYFk_!-HVCq9@n&T%Qs%JRQSY!z&B8 zN3F3|KeGSPZk|-^Bdg@aA##O=o0hktrxFj)cn>RSzrIEGRBA<2{;ofjvVZHS+W;zc zpwI35E$}!RrGhP2`gEjHuifaI(;YQwBSKTVn&LS%YT`aqbXeF5YYj8sxczo#DkfjO z(xDf8XxBD5ejCqbm%~Sg>#3dWsMO(=9Wwo>RL@a&F7~5RZ@y-tZ;prEC>6cp_^TbL z)T0qoT6aTD`nj*JehcBZ0ySwZiw4-+U@f<4xHQ$Bigjvz;#3d#&>t_nI}J~QSHef1 zVQbE}qf+nK3~cU8rQ+{MBl}XRuZOzW;F<6$l*(1VAMZq^THIc9s4G?)whlR!gkWE7 zx8KU6WyXn?SgAd8%SG-+r3MziKU_zp{zovklH(~nTulDwsfe~z>W9r4yS=H@!kA9q zhf=AQ?}k-2#dCU;D%)hvIZ`R#=+gtbU?q3{<~=ivtaYep$j0^IDy-DMA33h!Cl)7y1zOFr$ z8u8JG_XDWZ{oNaU22!c1c4t4g#%ls7WpaOdS453QYo`gSSL#jLZt+yLLZ$^qEO*(9-4I`2rCV7E?YAY zD%()eHg@w|;O&hWEIzEB}gHr=(cq zGF-yOP%IiG56yN_Eb5oXx<(@wGf_&@@!mcMDz)$Pbw34SrJ-BQ8G5l}rd^oTwGPvy z6pPw1pScd9SmZw0e{T!LqG5)Ud=z4F7F?KrdZvtGQM-EIw8xlNG=_r@hWx0++haWL zknffEp%B!*k!Dy;A;?AgPydjn6T`nNHf}-)X2C_)l%b!}bfQ)GSp;I@(r_Dgcm5L7 z$sF`t$JG15#(AsCXVX%OMQ!izhu)&ugnK(Vb0uPNE?mTVdwf8%iT9Mumhu&vO!TGVd6jY6@;>_DbxbDnP^x-XaU0{b z6{a0AnPNr%-r?B}h?QJalswZrjberSsrx5qXfiP@HrcAASk$-5ym%3l$$Yrj)h)}~ zIBNw+e4;5-3>Pncvab)78Wv=gfKbUpsSmkJM`$wP(l2C5jk6Ztb=xUSCJQi`6h0f( z#5ilUJ8w0LB8A(Vf2mA_3q#P;&&DHC^5NoY^|n22uu?mE!RQ}pGT|QdYu3j&YfV4v zM=xzIL@Bep&j(SYXjkL~KyvRh`)~YlCwW>jyNgXhT+J z=OR?Bald9qH*3{Mtc!UqVzKeW^JS6syO>O@wb`;RMW(I#%dD(kee|C3th0If4mabR zsQz%$9U3tH~fp&jVGOnHapuOOkB0!E_?NDjC4AKA!9gh zB2<|OA6?#djJ?ZOlQG=pZE-ws*DsN8CVh0x^l7lno8u)WQd2W?ncSj@LTTC5s#)_E z)`F^~+NM=&+cs_6Y3%IVJ2*OZaPH{R$<@uhvxldbw~w!%e?VZDu0h?p2Wv%LkDk3k 
zdWZJu+pqtCfrG*Z4;dOhZ1{+gqehPz8!>KtWK?uaY+U?=i3yV?CnhDQq)tgo&zL%G z`i#t(vu0<_nXAvvnU_01Z$bXTMT?g#T~<)IylBPBRjb#mUAKP2#!Z{I6mKorR=T}x z$Ie~lyDRqW-M9Zh<-tRTj~uN!cKpQ2Q>V`u&Q_npKG9y6SPP#ig!4aTzwz7)z{@RM z5xQKViG=b$ynDg@#tjH{ASH>O|IK$G>hDg}-eveLZOk2rpIt+}Q+q-6QvHoP5Ojw^ z8W<=JlX-gl(c2I|zldVre|Fb_E~HXlxD7#t{>v|<>MyQ->o!E~`rmai#R>mZ2%8nc z!oBN`Joj?K9y5&_#)rvI&)S--HpPpLCN!=jyzl?L*cgHX(Hn$Kk?(g-jTi~^&<81R2DHX`B1aVP=XKq=S`%D@h=6YK)zU^l1$d%#|> z59|jAKqWW`4uQjfaz%Ox_)opqD5dx2@ynCDo=2e$u; z`wi}BXJ_C6Jb@SR20p+S_<=wA-fu5#`*#hIvGDzO4U=^E&Hz)vG%y{^0GVJWm<48o zEHDSm1$vMTa=<*03+977umI$PgbUU>PU?g1>%YvzLfNq#luY;xQxG84==?txT8^XA+{ z+bagA||rc>19q z2Hn9fR-EkNkFRxa&q*J4^+T@fbpz9~Ft9K8IGdM6aXc@Pn3zbVrlw|Q=H@b)Ty9~} zq=`bIR9ad#ZE9uJtXcEsEm~Mx3xZ13l2YPUty|mLwrSJ0ZM$|Fjh&snef#zf4vvma zP8~WpJ9q5p;?k*;tE-!vyL;!(9+WA2`}p|!`uX|$2LuELcInc!YfwGthqWjl84+_`I4 zdHL?$6%~6ZIp2TaKxO5@gNF_sK78cJ(W6yW$BrF8e&WQ*lc!FdK7Hnl!Ep9$b@jP( z=g(ibaPi_L^nazUbmGdZkz1_c{bjh9`P78NzrHk? zDtV5_i`x=l0{*TS#aqGmOI{Y|G_czNd(a*@07u{iIsj+T5x9U(z!kUwchDJl08ikh zl!=q@I0p|waU+19gOVVeN$@n31P??>0-1DS(v?XNlWt7@{#U>Ivx){VdEpIk8QM8` zAvc6qQj3SOQsGR7F&WNe1e1|WMll)9WDJwBOd^>4uD8FVYF^NdXWRbAzX3jnwe!E> zMR4lktwRyAYgwDuG5ODZ8+<=&;D6Xl;h{K;7XkfUVnR(|mq1ALtUXF)YM2hM{F zfJP0Cpv&MDfW+avx$xLbcw=8d0p0X8t4$?8Fj3PrOicJNQ=TgaE~X0UuvMl?X?b6{ zCAS?6ky|?KqG&eRSw2oO$CP`V!@E#=C^6yjCZp8M)Z9#FE|*!zn^-8CC>55)XHIfc^sq3>p|VXmFS<-Djm6t;0r+7&UVAs4=6*j)@pME@J$+$njB; z(NQtcu`zM6@o^L4Cr(J1I4NQBq{PX#bOSaC>0w&>l#KML8PldtpEhH9=8Tz{vu4ho zl{I@#*4#Pzx!L-h?0Gr4^XBK~&0mmbOLu1T7cWA3xU67lVZrjkqU9@!R<2mJa`mb; ztJkhsw|4!y4eK{<*tBu;rY)O`w`?sg*=kF-Y`2wd-%*D2uzdHfirsrE_U_rYcmKWv z`zsF|tUPq^@S!7zj~=NydaUaBu@lEno;Y>VmhR=AK5M8xd#)Pk;l)cAE~5vp+Xy*I z;jM~w59L9^dgGM%u|jG>KM}O3{fQv`6p(%M&# z6!ZaoK|jzR3;+YcAP@!ygCSrj2nWN!a4-Ul1f#%cFb0eT2ASvcMcL7i0oG$ObuJ9>@jr zK^|BD^1(u|2rLFmz*4Xb6o5jo929{SU?o@$)_}EO9as-GfK^~4*aS9%Eua`|1tnk` zC;@HJ57-O#f&HKo90Z5JVQ>T-1qVPCI0lY`6W}B`1x|xAzyQvI zYH$vm2N%Faa0y%nuYfDyD!2w-Q_6x~$HYc|*O|~%iA(Oi=0e$$f-lS)PL=l?HC4Lq 
zq7%KzlV{^hOuoa@6#Pam;K`5p^5o1o6VoTpetb5NH-DVZpLsSAwqQ1o)<0t6Elrx5 zTA4SKH*eBHX>BPqRav!crfzQ2qLp=P!B*9#Wm|PS8%--a`!?;{I<#}tIN9OZlf6s( zP7bb)ZcgqUI(PJN@$BT~>h0#^?%UbV!{0N&E6}@(Pgma{zit8DgS%+Eib1+=cqr99 zB)GRWRP3Yc+oNBv{viW;4-6gDC#>(_ena{X9S}Zn*r4HIBLM~{ga8ygWfE`I!ksEN@DF_U5^$0f!mO-P=Yl8`!S%H*`f^rVdBsj1VZOi!DU znK5%JuB)fbnLc-hJ~Mk}&a8P^xpU^v&C@T)&d*smZ&B{z`AhPaE?Aadu&{8^@+Cz} zS1emuuxj;kJl`r>zhcA6jjJ}T-n?eZy5jX)HJudyCShyRa){@wcYiw~&7+5e4WhWn#tYDz2%F`=5<8Xu2IaF?>_ z;}#}Tm`wo+{ptUnpe1~$femN{S_4}^IVRVL`bjcO7fi z%H&qkNI6W-7BD@6sj3M~dIe0OO1Uh^$`VfS_fC!4G?Unu) zJIu5{WXAu7U@ZMJb8}{vVJa~G*{Ef-_w+|wS=DSszet8N^l2o+|2g*M%30; z)3%0{)%GP-_OXPsu~o^0rwSFhv{mIPg~zp@f3Oj**M7d@B$Qd#lzG%ec&FBWxwr61 zt-W%raL1~qOkt9cQS&i3S;$w`d|on3$gTZ+d@(QNnbfS&z ziuBZ|w8`VsVI!e?sU1mmHcxGyxW83}}Eo9A%rD2y3otToAVO*iX3ah#m zP5LBf#7&4xqgA-mLN5BJBu`*pWwd2TVoH>8OHQ!BcZ;4WQ==1&b+Gd0v2ig8Nl}T8 z7Pu_2b{$Jgs+L3HCc2cA#JH$r=1xJI56wtRVE!Gs{&cbvjAmUza#UI-D`(u1o>4VB zqUfJUrOdFgGCUzEE^)ow#R=x|D+qG@wQQdDZg3MKtrS%vb+ahc<%MI}y+qt;6tIkgEaY+F;K-Pq!q z&0}ztniw|&E&eSBsFQ8puptZ!s>7Q`k= zHe;&HoQ+c^`-Z`W2_J36|L%ACpaf5V)>cRacg`sf|9efX*ZP2CZl5Srj}<5Zzcgj2 zjdP1i=G@?0L%(mquUd7H`TYSE*7|K2uXUcUE1Z6?dERsLU-JB!(eq>G`HGXRAgw(P zwQ*nLR;?t=qHm2=(HUvF)=1G$H22TJxxgzlWvqb|0Btfrd!@xI(@)@gKrJ}ZZ!J63%@wvmwVd^7Bzkwj7#QtqmAklYZSo_)ShqD zbtfSl`IlH5R&>Z%(Xze99jF{zf1GN^JFBJCMs>%^sd8+PpP6&riA!B~Hf!kdn$hDm z=CKgJq~mIu-f)&+UUPX>hEriu5YBmOoPW&nY+AR#4O)hx!SI z`8sW_%2%RDke|#MXG}Hk+0f4q#zs}Zj|1nlWL5(|^)phPpIjx1yxdRGVe0Se%n!~F zEMlcJzG`PO=4P*%YQLEiN;nnf^S12dtTvvhX*AHOntkJJTrrbMY4l<;Fb=kI)X_@y z7zO1q*b@y&d<-Nm`Rzp zx2;vhN-7!T$Falc(eNTAgpUvmSIlO)zRu#%sU#e8!SkpF6PhO#;PDw`198_rH+~Dg=j#AMP z<6%d&9;o&Xs+FQz&L;GQQ^Io--XP!-HF4 zY(VJ&iJ;}2kS)~h_j4ze`V$GAK|&?KKYeFH~B4vY~39*r^Hy+Q@1D0*P&Xwx_O30adonbvat}+_0xt9;HxwiWX=&)!J|9gzO|!6;?=)QQ5ev z)hTHc+@n&Xkn*`3vp@G~acSuZDamS6Z+CBZPfp$Og%asg65Wv~PMI2);hvT<-90HT zBPup3BT8LtImW>$b1UJL?^!Q!3hc4-FF3dLIcaYi+!j+LL>|i$tWk zjo-d53$Aw%Pl<8kJ=EvT^)6zKxpd=w)TG)C9I)Km+0(<-t7GH+G)T+qeMPfez42b^ 
z&$RVU5K2hz8}Fn3EL-m*+ITI{cptSV*m@5L@ZS-m&s|m2JIScM*l2u9p1T^U^VFDe z78O-joQC7?I)+g(b=x+os+b0KHF6+DWc|j7|IGsRQ$pQ^l=0&0#ikqA!(4OmMVES* zi0T^}laibfm5`i15Hk>)eSWKPFR7bNdFhR;y-|lXy3Uz>W9Ak*Vt-~=8eUM=)m+z5 ztc4MYzzc`ZcuiSjt!byx)ZYH#@|2A+ao5xBhcGBz%MSw*DJ<5CO*n9#@i>_KQ7+e eo1SKS#RU3v#8o%$cKqRg+}F5e6NdlK|9=6ZG4whB literal 0 HcmV?d00001 From a17bff9f94c0d66ef2cd53c6ef0f299ec35bd5a1 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Tue, 20 Jun 2023 23:46:26 -0700 Subject: [PATCH 2/9] small issue --- .../java/io/delta/kernel/parquet/RowConverter.java | 2 +- .../delta/kernel/parquet/TestParquetBatchReader.java | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java index f612d7b7350..a50432f5ca3 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java @@ -69,7 +69,7 @@ class RowConverter @Override public Converter getConverter(int fieldIndex) { - return converters[fieldIndex]; + return converters[parquetOrdinalToConverterOrdinal.get(fieldIndex)]; } @Override diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java index 3d60e93d811..3814d67d9fd 100644 --- a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java @@ -127,9 +127,15 @@ public void readSubsetOfColumnsWithMissingColumnsInFile() StructType readSchema = new StructType() .add("booleanType", BooleanType.INSTANCE) .add("integerType", IntegerType.INSTANCE) - .add("missing_column_primitive", DateType.INSTANCE) .add("missing_column_struct", - new StructType().add("ab", 
IntegerType.INSTANCE)); + new StructType().add("ab", IntegerType.INSTANCE)) + .add("longType", LongType.INSTANCE) + .add("missing_column_primitive", DateType.INSTANCE) + .add("nested_struct", + new StructType() + .add("aa", StringType.INSTANCE) + .add("ac", new StructType().add("aca", IntegerType.INSTANCE)) + ); readAndVerify(readSchema, 23 /* readBatchSize */); } From 40011793b59eafd1543b2ba7694292c1a950d011 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 10:12:29 -0700 Subject: [PATCH 3/9] Update DefaultColumnarBatch.java --- .../delta/kernel/data/DefaultColumnarBatch.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java index e227b05e993..46f2c0dfbce 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/DefaultColumnarBatch.java @@ -1,3 +1,18 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package io.delta.kernel.data; import io.delta.kernel.types.StructType; From aabccb3ea93ceadc90d34980260bbbd6fefb4275 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 17:39:09 -0700 Subject: [PATCH 4/9] address review --- .../java/io/delta/kernel/utils/Utils.java | 92 ++++++++++++++---- .../io/delta/kernel/DefaultKernelUtils.java | 76 +++++---------- .../kernel/client/DefaultParquetHandler.java | 13 +-- .../data/vector/DefaultArrayVector.java | 3 + .../data/vector/DefaultBinaryVector.java | 6 +- .../data/vector/DefaultBooleanVector.java | 1 - .../kernel/data/vector/DefaultByteVector.java | 8 -- .../data/vector/DefaultDoubleVector.java | 12 +-- .../data/vector/DefaultFloatVector.java | 12 +-- .../kernel/data/vector/DefaultIntVector.java | 8 -- .../kernel/data/vector/DefaultLongVector.java | 8 -- .../kernel/data/vector/DefaultMapVector.java | 3 + .../data/vector/DefaultShortVector.java | 13 +-- .../data/vector/DefaultStructVector.java | 1 - .../delta/kernel/parquet/ArrayConverter.java | 23 +++-- .../io/delta/kernel/parquet/MapConverter.java | 22 +++-- .../kernel/parquet/ParquetConverters.java | 88 ++++++++--------- .../io/delta/kernel/parquet/RowConverter.java | 15 +-- .../parquet/TestParquetBatchReader.java | 32 ++++-- .../test/resources/parquet/all_types.parquet | Bin 24532 -> 24577 bytes 20 files changed, 220 insertions(+), 216 deletions(-) diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/Utils.java b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/Utils.java index 16b57a3becc..1fbff7fbe8c 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/Utils.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/Utils.java @@ -28,30 +28,36 @@ import io.delta.kernel.types.StringType; import io.delta.kernel.types.StructType; -public class Utils { +public class Utils +{ /** * Utility method to create a singleton {@link CloseableIterator}. * * @param elem Element to create iterator with. 
- * @param Element type. + * @param Element type. * @return A {@link CloseableIterator} with just one element. */ - public static CloseableIterator singletonCloseableIterator(T elem) { - return new CloseableIterator() { + public static CloseableIterator singletonCloseableIterator(T elem) + { + return new CloseableIterator() + { private boolean accessed; @Override - public void close() throws IOException { + public void close() throws IOException + { // nothing to close } @Override - public boolean hasNext() { + public boolean hasNext() + { return !accessed; } @Override - public T next() { + public T next() + { accessed = true; return elem; } @@ -61,14 +67,17 @@ public T next() { /** * Convert a {@link Iterator} to {@link CloseableIterator}. Useful when passing normal iterators * for arguments that require {@link CloseableIterator} type. + * * @param iter {@link Iterator} instance * @param Element type * @return A {@link CloseableIterator} wrapping the given {@link Iterator} */ - public static CloseableIterator toCloseableIterator(Iterator iter) { - return new CloseableIterator() { + public static CloseableIterator toCloseableIterator(Iterator iter) + { + return new CloseableIterator() + { @Override - public void close() { } + public void close() {} @Override public boolean hasNext() @@ -91,8 +100,10 @@ public T next() * @return A {@link ColumnVector} with a single element {@code value} */ // TODO: add String to method name or make generic? 
- public static ColumnVector singletonColumnVector(String value) { - return new ColumnVector() { + public static ColumnVector singletonColumnVector(String value) + { + return new ColumnVector() + { @Override public DataType getDataType() { @@ -100,21 +111,25 @@ public DataType getDataType() } @Override - public int getSize() { + public int getSize() + { return 1; } @Override - public void close() { + public void close() + { } @Override - public boolean isNullAt(int rowId) { + public boolean isNullAt(int rowId) + { return value == null; } @Override - public String getString(int rowId) { + public String getString(int rowId) + { if (rowId != 0) { throw new IllegalArgumentException("Invalid row id: " + rowId); } @@ -130,7 +145,8 @@ public String getString(int rowId) { * @param scanState Scan state {@link Row} * @return Physical schema to read from the data files. */ - public static StructType getPhysicalSchema(Row scanState) { + public static StructType getPhysicalSchema(Row scanState) + { // TODO needs io.delta.kernel.internal.data.ScanStateRow throw new UnsupportedOperationException("not implemented yet"); } @@ -142,7 +158,8 @@ public static StructType getPhysicalSchema(Row scanState) { * @param scanFileInfo {@link Row} representing one scan file. * @return a {@link FileStatus} object created from the given scan file row. */ - public static FileStatus getFileStatus(Row scanFileInfo) { + public static FileStatus getFileStatus(Row scanFileInfo) + { String path = scanFileInfo.getString(0); Long size = scanFileInfo.getLong(2); @@ -151,13 +168,48 @@ public static FileStatus getFileStatus(Row scanFileInfo) { /** * Close the iterator. + * * @param i1 */ - public static void safeClose(CloseableIterator i1) { + public static void safeClose(CloseableIterator i1) + { try { i1.close(); - } catch (IOException ioe) { + } + catch (IOException ioe) { throw new RuntimeException(ioe); } } + + /** + * Close the given one or more {@link CloseableIterator}s. 
{@link CloseableIterator#close()} + * will be called on all given non-null iterators. Will throw unchecked {@link RuntimeException} + * if an error occurs while closing. If multiple iterators causes exceptions in closing, the + * exceptions will be added as suppressed to the main exception that is thrown. + * + * @param iters + */ + public static void closeIterators(CloseableIterator... iters) + { + RuntimeException exception = null; + for (CloseableIterator iter : iters) { + if (iter == null) { + continue; + } + try { + iter.close(); + } + catch (Exception ex) { + if (exception == null) { + exception = new RuntimeException(ex); + } + else { + exception.addSuppressed(ex); + } + } + } + if (exception != null) { + throw exception; + } + } } diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java index 7be9eaacdee..b98819e4e31 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java @@ -17,6 +17,8 @@ import java.util.ArrayList; import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Type; @@ -38,33 +40,41 @@ private DefaultKernelUtils() {} * @return */ public static final MessageType pruneSchema( - MessageType fileSchema, // parquet + GroupType fileSchema, // parquet StructType deltaType) // delta-core { return deltaType.fields().stream() .map(column -> { - Type type = findStructField(fileSchema, column); + Type type = findSubFieldType(fileSchema, column); if (type == null) { return null; } Type prunedSubfields = pruneSubfields(type, column.getDataType()); return new MessageType(column.getName(), prunedSubfields); }) - .filter(type -> type != null) + .filter(Objects::nonNull) 
.reduce(MessageType::union) .get(); } - private static Type findStructField(MessageType fileSchema, StructField column) + /** + * Search for the Parquet type in {@code groupType} of subfield which is equivalent to + * given {@code field}. + * + * @param groupType Parquet group type coming from the file schema. + * @param field Sub field given as Delta Kernel's {@link StructField} + * @return {@link Type} of the Parquet field. Returns {@code null}, if not found. + */ + public static Type findSubFieldType(GroupType groupType, StructField field) { // TODO: Need a way to search by id once we start supporting column mapping `id` mode. - final String columnName = column.getName(); - if (fileSchema.containsField(columnName)) { - return fileSchema.getType(columnName); + final String columnName = field.getName(); + if (groupType.containsField(columnName)) { + return groupType.getType(columnName); } - // Parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase - // check for direct match above but if no match found, try case-insensitive match - for (org.apache.parquet.schema.Type type : fileSchema.getFields()) { + // Parquet is case-sensitive, but the engine that generated the parquet file may not be. + // Check for direct match above but if no match found, try case-insensitive match. 
+ for (org.apache.parquet.schema.Type type : groupType.getFields()) { if (type.getName().equalsIgnoreCase(columnName)) { return type; } @@ -81,20 +91,12 @@ private static Type pruneSubfields(Type type, DataType deltaDatatype) } GroupType groupType = (GroupType) type; - StructType deltaStructType = (StructType) deltaDatatype; - List newParquetSubFields = new ArrayList<>(); - for (StructField subField : deltaStructType.fields()) { - String subFieldName = subField.getName(); - Type parquetSubFieldType = groupType.getType(subFieldName); - if (parquetSubFieldType == null) { - for (org.apache.parquet.schema.Type typeTemp : groupType.getFields()) { - if (typeTemp.getName().equalsIgnoreCase(subFieldName)) { - parquetSubFieldType = type; - } - } - } - newParquetSubFields.add(parquetSubFieldType); - } + List newParquetSubFields = + ((StructType) deltaDatatype).fields().stream() + .map(structField -> findSubFieldType(groupType, structField)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + return groupType.withNewFields(newParquetSubFields); } @@ -158,30 +160,4 @@ public static void checkState(boolean isValid, String message) throw new IllegalStateException(message); } } - - /** - * Search for the Parquet type for in the {@code groupType} for the field equilant to - * {@code field}. - * - * @param groupType Parquet group type coming from the file schema. - * @param field Sub field given as Delta Kernel's {@link StructField} - * @return {@link Type} of the Parquet field. Returns {@code null}, if not found. - */ - public static Type findFieldType(GroupType groupType, StructField field) - { - // TODO: Need a way to search by id once we start supporting column mapping `id` mode. - final String columnName = field.getName(); - if (groupType.containsField(columnName)) { - return groupType.getType(columnName); - } - // Parquet is case-sensitive, but hive is not. 
all hive columns get converted to lowercase - // check for direct match above but if no match found, try case-insensitive match - for (org.apache.parquet.schema.Type type : groupType.getFields()) { - if (type.getName().equalsIgnoreCase(columnName)) { - return type; - } - } - - return null; - } } diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java index 0c2dd8ea78d..242e2c452c9 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java @@ -61,7 +61,8 @@ public boolean hasNext() @Override public FileReadContext next() { - return () -> fileIter.next(); + Row row = fileIter.next(); + return () -> row; } }; } @@ -73,6 +74,7 @@ public CloseableIterator readParquetFiles( { return new CloseableIterator() { + private final ParquetBatchReader batchReader = new ParquetBatchReader(hadoopConf); private FileReadContext currentFile; private CloseableIterator currentFileReader; @@ -80,12 +82,7 @@ public CloseableIterator readParquetFiles( public void close() throws IOException { - if (currentFileReader != null) { - currentFileReader.close(); - } - - fileIter.close(); - // TODO: implement safe close of multiple closeables. + Utils.closeIterators(currentFileReader, fileIter); } @Override @@ -95,10 +92,10 @@ public boolean hasNext() // initialize the next file reader or return false if there are no more files to // read. 
if (currentFileReader == null || !currentFileReader.hasNext()) { + Utils.closeIterators(currentFileReader); if (fileIter.hasNext()) { currentFile = fileIter.next(); FileStatus fileStatus = Utils.getFileStatus(currentFile.getScanFileRow()); - ParquetBatchReader batchReader = new ParquetBatchReader(hadoopConf); currentFileReader = batchReader.read(fileStatus.getPath(), physicalSchema); } else { diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java index be85b62b554..c093e09e252 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultArrayVector.java @@ -66,6 +66,9 @@ public DefaultArrayVector( @Override public List getArray(int rowId) { + if (isNullAt(rowId)) { + return null; + } checkValidRowId(rowId); int start = offsets[rowId]; int end = offsets[rowId + 1]; diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java index 43129eac93a..4f37e58c79e 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBinaryVector.java @@ -43,11 +43,10 @@ public DefaultBinaryVector(DataType dataType, int size, byte[][] values) { super(size, dataType, Optional.empty()); checkArgument(dataType instanceof StringType || dataType instanceof BinaryType, - "invalid type"); + "invalid type for binary vector: " + dataType); this.values = requireNonNull(values, "values is null"); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); } @Override @@ 
-90,6 +89,9 @@ public String getString(int rowId) @Override public byte[] getBinary(int rowId) { + if (!(getDataType() instanceof BinaryType)) { + throw unsupportedDataAccessException("binary"); + } checkValidRowId(rowId); return values[rowId]; } diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java index 1e68effb456..a09238ceeba 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultBooleanVector.java @@ -41,7 +41,6 @@ public DefaultBooleanVector(int size, Optional nullability, boolean[] { super(size, BooleanType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); if (nullability.isPresent()) { diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java index e146d9a99aa..928aed22ded 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultByteVector.java @@ -42,16 +42,8 @@ public DefaultByteVector(int size, Optional nullability, byte[] value { super(size, ByteType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element 
components are not of same size" + - "value array size = %s, nullability array size = %s", - values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultDoubleVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultDoubleVector.java index 7f3b8fb6e30..0d05c230247 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultDoubleVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultDoubleVector.java @@ -19,7 +19,7 @@ import static java.util.Objects.requireNonNull; import java.util.Optional; -import io.delta.kernel.types.LongType; +import io.delta.kernel.types.DoubleType; /** * {@link io.delta.kernel.data.ColumnVector} implementation for double type data. @@ -39,18 +39,10 @@ public class DefaultDoubleVector */ public DefaultDoubleVector(int size, Optional nullability, double[] values) { - super(size, LongType.INSTANCE, nullability); + super(size, DoubleType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element components are not of same size" + - "value array size = %s, nullability array size = %s", - values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultFloatVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultFloatVector.java index 56173207d00..6791cca59c6 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultFloatVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultFloatVector.java @@ -19,7 +19,7 @@ 
import static java.util.Objects.requireNonNull; import java.util.Optional; -import io.delta.kernel.types.LongType; +import io.delta.kernel.types.FloatType; /** * {@link io.delta.kernel.data.ColumnVector} implementation for float type data. @@ -39,18 +39,10 @@ public class DefaultFloatVector */ public DefaultFloatVector(int size, Optional nullability, float[] values) { - super(size, LongType.INSTANCE, nullability); + super(size, FloatType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element components are not of same size" + - "value array size = %s, nullability array size = %s", - values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultIntVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultIntVector.java index 2dfdfa55843..c145f9b5666 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultIntVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultIntVector.java @@ -45,16 +45,8 @@ public DefaultIntVector( super(size, dataType, nullability); checkArgument(dataType instanceof IntegerType || dataType instanceof DateType); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element components are not of same size" + - "value array size = %s, nullability array size = %s", - 
values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java index 2f9394c1902..708733678cf 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultLongVector.java @@ -41,16 +41,8 @@ public DefaultLongVector(int size, Optional nullability, long[] value { super(size, LongType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element components are not of same size" + - "value array size = %s, nullability array size = %s", - values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultMapVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultMapVector.java index d870dc7e0c2..107c6e81939 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultMapVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultMapVector.java @@ -70,6 +70,9 @@ public DefaultMapVector( @Override public Map getMap(int rowId) { + if (isNullAt(rowId)) { + return null; + } checkValidRowId(rowId); int start = offsets[rowId]; int end = offsets[rowId + 1]; diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java index 4f27879fd86..eb39629bef2 100644 --- 
a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultShortVector.java @@ -19,8 +19,7 @@ import static java.util.Objects.requireNonNull; import java.util.Optional; -import io.delta.kernel.types.ByteType; -import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.ShortType; /** * {@link io.delta.kernel.data.ColumnVector} implementation for short type data. @@ -40,18 +39,10 @@ public class DefaultShortVector */ public DefaultShortVector(int size, Optional nullability, short[] values) { - super(size, ByteType.INSTANCE, nullability); + super(size, ShortType.INSTANCE, nullability); this.values = requireNonNull(values, "values is null"); - checkArgument(values.length >= 0, "invalid vector size: %s", values.length); checkArgument(values.length >= size, "invalid number of values (%s) for given size (%s)", values.length, size); - if (nullability.isPresent()) { - checkArgument(values.length == nullability.get().length, - "vector element components are not of same size" + - "value array size = %s, nullability array size = %s", - values.length, nullability.get().length - ); - } } /** diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java index 9bb452ca624..143f085ca99 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/data/vector/DefaultStructVector.java @@ -52,7 +52,6 @@ public DefaultStructVector( { super(size, dataType, nullability); checkArgument(dataType instanceof StructType, "not a struct type"); - StructType structType = (StructType) dataType; checkArgument( structType.length() == memberVectors.length, diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java 
b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java index 64cce0e2760..be888fac487 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ArrayConverter.java @@ -42,21 +42,21 @@ class ArrayConverter private int collectorIndexAtStart; public ArrayConverter( - int maxBatchSize, + int initialBatchSize, ArrayType typeFromClient, GroupType typeFromFile) { this.typeFromClient = typeFromClient; final GroupType innerElementType = (GroupType) typeFromFile.getType("list"); this.converter = new ArrayCollector( - maxBatchSize, + initialBatchSize, typeFromClient, innerElementType ); // initialize working state - this.nullability = initNullabilityVector(maxBatchSize); - this.offsets = new int[maxBatchSize + 1]; + this.nullability = initNullabilityVector(initialBatchSize); + this.offsets = new int[initialBatchSize + 1]; } @Override @@ -67,7 +67,7 @@ public Converter getConverter(int fieldIndex) return converter; default: throw new IllegalArgumentException( - "Invalid field index for a map column: " + fieldIndex); + "Invalid field index for a array column: " + fieldIndex); } } @@ -95,10 +95,7 @@ public ColumnVector getDataColumnVector(int batchSize) offsets, converter.getArrayVector() ); - this.currentRowIndex = 0; - this.nullability = initNullabilityVector(nullability.length); - this.offsets = new int[offsets.length]; - + resetWorkingState(); return vector; } @@ -123,6 +120,14 @@ public void resizeIfNeeded() } } + @Override + public void resetWorkingState() + { + this.currentRowIndex = 0; + this.nullability = initNullabilityVector(nullability.length); + this.offsets = new int[offsets.length]; + } + public static class ArrayCollector extends GroupConverter { diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/MapConverter.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/MapConverter.java index 
b056f286f6c..4b046f5384d 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/MapConverter.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/MapConverter.java @@ -42,21 +42,21 @@ class MapConverter private int collectorIndexAtStart; public MapConverter( - int maxBatchSize, + int initialBatchSize, MapType typeFromClient, GroupType typeFromFile) { this.typeFromClient = typeFromClient; final GroupType innerMapType = (GroupType) typeFromFile.getType("key_value"); this.converter = new MapCollector( - maxBatchSize, + initialBatchSize, typeFromClient, innerMapType ); // initialize working state - this.nullability = initNullabilityVector(maxBatchSize); - this.offsets = new int[maxBatchSize + 1]; + this.nullability = initNullabilityVector(initialBatchSize); + this.offsets = new int[initialBatchSize + 1]; } @Override @@ -96,10 +96,7 @@ public ColumnVector getDataColumnVector(int batchSize) converter.getKeyVector(), converter.getValueVector() ); - this.currentRowIndex = 0; - this.converter.currentEntryIndex = 0; - this.nullability = initNullabilityVector(nullability.length); - this.offsets = new int[offsets.length]; + resetWorkingState(); return vector; } @@ -124,6 +121,15 @@ public void resizeIfNeeded() } } + @Override + public void resetWorkingState() + { + this.currentRowIndex = 0; + this.converter.currentEntryIndex = 0; + this.nullability = initNullabilityVector(nullability.length); + this.offsets = new int[offsets.length]; + } + public static class MapCollector extends GroupConverter { diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java index 9d8bb713826..4f6e2fe712e 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetConverters.java @@ -53,53 +53,53 @@ class ParquetConverters { public 
static Converter createConverter( - int maxBatchSize, + int initialBatchSize, DataType typeFromClient, Type typeFromFile ) { if (typeFromClient instanceof StructType) { return new RowConverter( - maxBatchSize, + initialBatchSize, (StructType) typeFromClient, (GroupType) typeFromFile); } else if (typeFromClient instanceof ArrayType) { return new ArrayConverter( - maxBatchSize, + initialBatchSize, (ArrayType) typeFromClient, (GroupType) typeFromFile ); } else if (typeFromClient instanceof MapType) { return new MapConverter( - maxBatchSize, + initialBatchSize, (MapType) typeFromClient, (GroupType) typeFromFile); } else if (typeFromClient instanceof StringType || typeFromClient instanceof BinaryType) { - return new BinaryColumnConverter(typeFromClient, maxBatchSize); + return new BinaryColumnConverter(typeFromClient, initialBatchSize); } else if (typeFromClient instanceof BooleanType) { - return new BooleanColumnConverter(maxBatchSize); + return new BooleanColumnConverter(initialBatchSize); } else if (typeFromClient instanceof IntegerType || typeFromClient instanceof DateType) { - return new IntColumnConverter(typeFromClient, maxBatchSize); + return new IntColumnConverter(typeFromClient, initialBatchSize); } else if (typeFromClient instanceof ByteType) { - return new ByteColumnConverter(maxBatchSize); + return new ByteColumnConverter(initialBatchSize); } else if (typeFromClient instanceof ShortType) { - return new ShortColumnConverter(maxBatchSize); + return new ShortColumnConverter(initialBatchSize); } else if (typeFromClient instanceof LongType) { - return new LongColumnConverter(maxBatchSize); + return new LongColumnConverter(initialBatchSize); } else if (typeFromClient instanceof FloatType) { - return new FloatColumnConverter(maxBatchSize); + return new FloatColumnConverter(initialBatchSize); } else if (typeFromClient instanceof DoubleType) { - return new DoubleColumnConverter(maxBatchSize); + return new DoubleColumnConverter(initialBatchSize); } // else if 
(typeFromClient instanceof DecimalType) { // @@ -122,7 +122,9 @@ public interface BaseConverter */ boolean moveToNextRow(); - void resizeIfNeeded(); + default void resizeIfNeeded() {} + + default void resetWorkingState() {} } public static class NonExistentColumnConverter @@ -147,12 +149,6 @@ public boolean moveToNextRow() { return true; } - - @Override - public void resizeIfNeeded() - { - // nothing to resize - } } public abstract static class BasePrimitiveColumnConverter @@ -163,12 +159,12 @@ public abstract static class BasePrimitiveColumnConverter protected int currentRowIndex; protected boolean[] nullability; - BasePrimitiveColumnConverter(int maxBatchSize) + BasePrimitiveColumnConverter(int initialBatchSize) { - checkArgument(maxBatchSize >= 0, "invalid maxBatchSize: %s", maxBatchSize); + checkArgument(initialBatchSize >= 0, "invalid initialBatchSize: %s", initialBatchSize); // Initialize the working state - this.nullability = initNullabilityVector(maxBatchSize); + this.nullability = initNullabilityVector(initialBatchSize); } @Override @@ -186,10 +182,10 @@ public static class BooleanColumnConverter // working state private boolean[] values; - BooleanColumnConverter(int maxBatchSize) + BooleanColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new boolean[maxBatchSize]; + super(initialBatchSize); + this.values = new boolean[initialBatchSize]; } @Override @@ -230,10 +226,10 @@ public static class ByteColumnConverter // working state private byte[] values; - ByteColumnConverter(int maxBatchSize) + ByteColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new byte[maxBatchSize]; + super(initialBatchSize); + this.values = new byte[initialBatchSize]; } @Override @@ -274,10 +270,10 @@ public static class ShortColumnConverter // working state private short[] values; - ShortColumnConverter(int maxBatchSize) + ShortColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new short[maxBatchSize]; 
+ super(initialBatchSize); + this.values = new short[initialBatchSize]; } @Override @@ -318,12 +314,12 @@ public static class IntColumnConverter // working state private int[] values; - IntColumnConverter(DataType dataType, int maxBatchSize) + IntColumnConverter(DataType dataType, int initialBatchSize) { - super(maxBatchSize); + super(initialBatchSize); checkArgument(dataType instanceof IntegerType || dataType instanceof DataType); this.dataType = dataType; - this.values = new int[maxBatchSize]; + this.values = new int[initialBatchSize]; } @Override @@ -364,10 +360,10 @@ public static class LongColumnConverter // working state private long[] values; - LongColumnConverter(int maxBatchSize) + LongColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new long[maxBatchSize]; + super(initialBatchSize); + this.values = new long[initialBatchSize]; } @Override @@ -407,10 +403,10 @@ public static class FloatColumnConverter // working state private float[] values; - FloatColumnConverter(int maxBatchSize) + FloatColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new float[maxBatchSize]; + super(initialBatchSize); + this.values = new float[initialBatchSize]; } @Override @@ -451,10 +447,10 @@ public static class DoubleColumnConverter // working state private double[] values; - DoubleColumnConverter(int maxBatchSize) + DoubleColumnConverter(int initialBatchSize) { - super(maxBatchSize); - this.values = new double[maxBatchSize]; + super(initialBatchSize); + this.values = new double[initialBatchSize]; } @Override @@ -497,11 +493,11 @@ public static class BinaryColumnConverter // working state private byte[][] values; - BinaryColumnConverter(DataType dataType, int maxBatchSize) + BinaryColumnConverter(DataType dataType, int initialBatchSize) { - super(maxBatchSize); + super(initialBatchSize); this.dataType = dataType; - this.values = new byte[maxBatchSize][]; + this.values = new byte[initialBatchSize][]; } @Override diff --git 
a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java index a50432f5ca3..81663a69ae3 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/RowConverter.java @@ -1,6 +1,6 @@ package io.delta.kernel.parquet; -import static io.delta.kernel.DefaultKernelUtils.findFieldType; +import static io.delta.kernel.DefaultKernelUtils.findSubFieldType; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -37,7 +37,7 @@ class RowConverter private boolean[] nullability; RowConverter( - int maxBatchSize, + int initialBatchSize, StructType readSchema, GroupType fileSchema) { @@ -47,19 +47,19 @@ class RowConverter this.parquetOrdinalToConverterOrdinal = new HashMap<>(); // Initialize the working state - this.nullability = ParquetConverters.initNullabilityVector(maxBatchSize); + this.nullability = ParquetConverters.initNullabilityVector(initialBatchSize); int parquetOrdinal = 0; for (int i = 0; i < converters.length; i++) { final StructField field = fields.get(i); final DataType typeFromClient = field.getDataType(); - final Type typeFromFile = findFieldType(fileSchema, field); + final Type typeFromFile = findSubFieldType(fileSchema, field); if (typeFromFile == null) { converters[i] = new ParquetConverters.NonExistentColumnConverter(typeFromClient); } else { - converters[i] = - ParquetConverters.createConverter(maxBatchSize, typeFromClient, typeFromFile); + converters[i] = ParquetConverters.createConverter( + initialBatchSize, typeFromClient, typeFromFile); parquetOrdinalToConverterOrdinal.put(parquetOrdinal, i); parquetOrdinal++; } @@ -145,7 +145,8 @@ public void resizeIfNeeded() } } - private void resetWorkingState() + @Override + public void resetWorkingState() { this.currentRowIndex = 0; this.nullability = 
ParquetConverters.initNullabilityVector(this.nullability.length); diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java index 3814d67d9fd..eae04ce3704 100644 --- a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java @@ -20,6 +20,7 @@ import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; @@ -79,17 +80,16 @@ public class TestParquetBatchReader .add("nested_struct", new StructType() .add("aa", StringType.INSTANCE) - .add("ac", new StructType().add("aca", IntegerType.INSTANCE)) - ).add("array_of_prims", - new ArrayType(IntegerType.INSTANCE, true) - ).add("array_of_structs", - new ArrayType(new StructType().add("ab", LongType.INSTANCE), true) - ).add("map_of_prims", new MapType(IntegerType.INSTANCE, LongType.INSTANCE, true)) + .add("ac", new StructType().add("aca", IntegerType.INSTANCE))) + .add("array_of_prims", + new ArrayType(IntegerType.INSTANCE, true)) + .add("array_of_structs", + new ArrayType(new StructType().add("ab", LongType.INSTANCE), true)) + .add("map_of_prims", new MapType(IntegerType.INSTANCE, LongType.INSTANCE, true)) .add("map_of_complex", new MapType( IntegerType.INSTANCE, new StructType().add("ab", LongType.INSTANCE), - true - )); + true)); private static final LocalDate EPOCH = LocalDate.ofEpochDay(0); @@ -276,7 +276,7 @@ private static void verifyRowFromAllTypesFile( assertTrue(vector.isNullAt(batchWithIdx._2)); } else { - assertArrayEquals(expValue.getBytes(), vector.getBinary(batchWithIdx._2)); + assertEquals(expValue, vector.getString(batchWithIdx._2)); } break; } @@ -319,6 +319,13 @@ private static void verifyRowFromAllTypesFile( if (expIsNull) { 
assertTrue(vector.isNullAt(batchWithIdx._2)); } + else if (rowId % 29 == 0) { + // TODO: Parquet group converters calls to start/end don't differentiate + // between empty array or null array. The current reader always treats both + // of them nulls. + // assertEquals(Collections.emptyList(), vector.getArray(batchWithIdx._2)); + assertTrue(vector.isNullAt(batchWithIdx._2)); + } else { List expArray = Arrays.asList(rowId, null, rowId + 1); List actArray = vector.getArray(batchWithIdx._2); @@ -340,6 +347,13 @@ private static void verifyRowFromAllTypesFile( if (expIsNull) { assertTrue(vector.isNullAt(batchWithIdx._2)); } + else if (rowId % 30 == 0) { + // TODO: Parquet group converters calls to start/end don't differentiate + // between empty map or null map. The current reader always treats both + // of them nulls. + // assertEquals(Collections.emptyList(), vector.getMap(batchWithIdx._2)); + assertTrue(vector.isNullAt(batchWithIdx._2)); + } else { Map actValue = vector.getMap(batchWithIdx._2); assertTrue(actValue.size() == 2); diff --git a/kernel/kernel-default/src/test/resources/parquet/all_types.parquet b/kernel/kernel-default/src/test/resources/parquet/all_types.parquet index 460a69b66ec7f18d9e80205582c5e6e1abe0626e..e9c5658db2f41478a6ed4b0c03272b593e5050ca 100644 GIT binary patch delta 1689 zcma)6c~BEq7=OE)B?}EABfAD&f+$7=MMO+MV*t4nQMsvrpyI))AfO0BxWtPh@*3nO zDhRG}Xf#~m6wg|9EMsLDq3u-1acFfc?KlI|^bzg!-|ozBzu)hD$Ikn{w{N5$%NxMj z^Bm`Q3g{sLUG#YL<5x1egrIo<)EG=HR9855;gIKhK%q6HKk(_hKa5ryvGga3=FtYU z0RZE?MZN$4MdFwyi6nTV`$_)Xq_M;?iJ=i@PK|1kj71~bwX2|0A%v>n>Wt;wB8MZy4iDw%~? 
zCS1K)TIMPvE0L&x?yjk12Vxp%9)@Nb{q5<*857ZV&p zVjz$-;3<7cKCjUhy{>Yk1&`w}3racrj%~!9P$bdJMd3s&7l{+i@sT*$%*Atl7~g@5 z_d_cjgA-Rd31$RDn9)=YW{=ivsc3+~a5o7lvRq_k36*ZPb6AN~>g=-4dHs6X2DuDA zb+a~O{R0Anf`UUrgTq3<4%@LKJR&k28oG-VY(iqv!Nfy{l2Z;RryfpAO;0wQHiz@cwrOOSM8XK-OHZ?V0<(jW?*RQo)Z*6I7 zZI`v(Xuo+wt-f_zbE`ws+0oV6-PO~r?P0aOef_-y{euHTgLm!>-@P|{AGHUd#y&Hr z^Ns1+R|7xs;2ZZ!Sg?`Bmcg2h5pWtYFKFfw0aHC#ECv>PM8o$ULM#vRJfqE3#uT`S za0%fuLIXl0LOne084V{qg;+0~@{}vk963I%tT?AbNB+N2j{omb@)~hWQcBF|q6i{G z7fBFWMI=GaOe=JL$n@(JeguC;H?33XWSAj3&iy)_LPiiXuGCI|;qbDTlfP(*rS+nv zR^nwgii`a;d#|m5syl)pm|6)~s~0U%N`;AYJY3=(1*=)B3eC$p&Y+>pC}? z`-Y7ko^mf&Z#SP!o85gkZt?W<+UmW{2ikdy{G)e;?~d3L8MAY5)V}=(cE#?F+Y=v? zus3mE(t(4qhvJgsQxXm*rY5BwOiweo%#}6ewHN9t>#J1NXvj4UwT*RG>YG%}7vT$Uk+r(}c8^Be(c9VA z)ju%UJ=AkYJKTG>?_U4?!3RSRhacVj?%rdy$S01(Sa`}$gxSC*ztVg}|11WpkyN1urx!3!+To8uoAEjgTpYZK->F}lfhoaJ zbDrzD>IP2R1MxffD%jTO6UB@$Oc{-&X~3)K$7q}>G+;iMF>lOfUE=0HFjmNWi25niUyKY3EkZ3#RQeKq zH}&7wD&dw;+hr(k(uygj#i+UBl2bSQ8gn2hLMoLXO+UmVLRqPy_8 zK`sFSxmQ5$05P=@>(F_lzF6NB0 zg-F9kY{W%eg7}SX9fS)~5+C&@aahF9Ml?G~%QvBu9v$MC36dW5eot~phNv*y9q@fM z8zqrkjww+{h6r9PjLbW*2tvEVgrMgaesHrP*z-Fb>&Vja3I0ar;&6Vgg7`* zLJP>m#(4(}Vxv&PD{PJ(2U|x6N|O)!uxV?DBF8P%)E(^|M-vV+EkF+hXrbx3p9?B5 ztgoJ{w}F=~7f4HZ=@$?8T}|PzzYOf%L{LKdL$%vBFs95Q;WHap>Fxr5(jrK3m%-uf zBKXC9t8|iw^j!2~hg!|TQ{aA>!c&%+9iBqGOwJ*R7L0@=V9Sd*M(mbA{wksY6UJX9 zY8K-nR#Qt1mCPaZDpG^6{`sUlDBS(&YckJl#V<(QEJaOve=~HJ; zpFMN#?D=yS&L>|;Nlr~kOI4+%t1{9vGqN(XvxM0>7jtti<>uvGzM{UYQRiz4@}Zy* ziVBO1N{UNM%1X=2D#|M>u2xoEt*)x6uC1x771mv=Z@AXjaJ{jqsk!AwbL)+^)^?A! 
zj`q%uuFmc*ZTC%W&&}SRzTR7X{kI1C2M2}*hlhm2Bcr;JG2Qss#Q5ao?WsGrr}4;U z&5_CoJ!~zT=L7O8BW>Y^mlgCmt%fIFz81blG_ecghw;StV*)UNm>}@>jw0is!rKZ) zyra$2a7=|s$7EnKFwu=j~2t3mCv7TSHH&4+Pp1T%`!Va71ymf%xii}jY|5JM!#X>kFI#nBiRXK`s^DVXHxbI8%9U>tL& z;|P*su{m5G&0n!nU_`EB!AoE9S_>OXk=1%@@jBZL8$XfQd@9A4w~GV`)7XJ5%tU)1F0=M`LrD~0N!;{1}*vVwA`D6A~HT3l69 zT~@#_1dO7;QLuhx(Bp_Jwv_2eIvI<`*j0jgX2RJ!;>SoN2hdm z#-_*bPTsqHe`*HLj?f1A9p+e@5au6I=4LcL4h$D|XWzSsGG~PFaQCm?uYV#@D$U@y zxsZ81N1}4m_aY~iLQTB)Vy=)vPd^A1Vwh4W4wcdmv2Vsq zwZeR882!Q^^sNvejpb$BT+Y-%hg3>e8vMs?K!?TgZW_cDI1&~_&lvofc9;nB?H1NVIl9M7RfC(6iiXwgYOyqv;kbK4$bCz?KD2lRXe3^XJVOVs%g@i&^#7 z;7@f@XbT~J4J+i56&gPCnql6UV(BK|Zd}l7=zuG>;fgs>$sH+d2IbDHTMeoDE(&=N zVmuZhqTsy3Ld;4ycm`37hRghJ%G8@Kr^I=GkwP=}T|7@SeE1UtV*-H+E8{6ci)Xk6 ze*at{oK{GT@pFso7!AUj5_e`pmtqYmgu4n`EGg_MY(7_RFn(u9 r&EgbpG}{abN+$}N2Bq_w!oKAd{DV^-1XKD3e;7+lGaBvRN&)>ZtlLPx From a37ba8f4014b8604b01c8b642f9159e2b4312621 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 19:24:13 -0700 Subject: [PATCH 5/9] fix --- .../java/io/delta/kernel/parquet/TestParquetBatchReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java index eae04ce3704..5d143e4f749 100644 --- a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java @@ -91,7 +91,7 @@ public class TestParquetBatchReader new StructType().add("ab", LongType.INSTANCE), true)); - private static final LocalDate EPOCH = LocalDate.ofEpochDay(0); + private static final LocalDate EPOCH = new Date(0).toLocalDate().ofEpochDay(0); @Test public void readAllTypesOfData() From 7a311555079531f09ac737def6b6e840bc3554cd Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 20:06:34 -0700 Subject: [PATCH 6/9] fix 
data test issue --- .../parquet/TestParquetBatchReader.java | 2 ++ .../test/resources/parquet/all_types.parquet | Bin 24577 -> 24577 bytes 2 files changed, 2 insertions(+) diff --git a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java index 5d143e4f749..95fa225292b 100644 --- a/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java +++ b/kernel/kernel-default/src/test/java/io/delta/kernel/parquet/TestParquetBatchReader.java @@ -218,6 +218,8 @@ private static void verifyRowFromAllTypesFile( break; } case "datetype": { + // Set `-Duser.timezone="UTC"` as JVM arg to pass this test in computers + // whose local timezone is non-UTC zone. LocalDate expValue = (rowId % 61 != 0) ? new Date(rowId * 20000000L).toLocalDate() : null; if (expValue == null) { diff --git a/kernel/kernel-default/src/test/resources/parquet/all_types.parquet b/kernel/kernel-default/src/test/resources/parquet/all_types.parquet index e9c5658db2f41478a6ed4b0c03272b593e5050ca..921715ef8336c393e1352332eb5b5c7fff871ccc 100644 GIT binary patch delta 343 zcmZoXz}R?zaf6_e$gNLP8#ru4<3w2)81^uJ*v$Y0lV=+3;1gvKy}=}UiAnVL-&Z#| zCcjpauLnvw3a~UOa2z(^VRjU72^48flxQoIIo_ziF-gU9l1A4eozt5Pcn%x*UN-4_ zY;*pzjR1>dpol9_@{)p-2;fUL|}%W`jT0~!Gos+{+{_Wt); z1&$`5sf#qa9$EA$drnG;oR$hy1(X!-1PU(8oV9Azu63_>ZPGctE%)~3Im#0?I6#h; zlwi@@Z0aM)!o4Hmu0zF{(x{{)MHq(wqJL5kx;{D9Qv^&vsr E0EpUhr2qf` delta 348 zcmZoXz}R?zaf6_e$h4Q|w=&s?#)+~pFzjLcu=_s{OjcCd!6(WfdV@*y5|e1(`wNqq zCcjpauXhk&X;k27GT=E3WCe=221>Lg${a5Q3aNNbQt4Wxae9*u&tU_f%O<^#EzW=Gyv3IvhUQe&@|CIWQ=S+ka9-rklgeY_ATH1B!kz3;X198C!l zZ3{KJ7FqN?_L!ssR1^tR1(aN*A>6enbJns|t5&_4)F2LyzGsAU0RAnIR01c-_VOa)Q<1HppgLFph~Lr?&Sx)kIMqU3|! 
zKva6LI};1X Date: Wed, 21 Jun 2023 22:36:28 -0700 Subject: [PATCH 7/9] use closeableiter.map --- .../kernel/client/DefaultParquetHandler.java | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java index 242e2c452c9..432746879a1 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/client/DefaultParquetHandler.java @@ -43,28 +43,12 @@ public CloseableIterator contextualizeFileReads( CloseableIterator fileIter, Expression predicate) { - return new CloseableIterator() - { - @Override - public void close() - throws IOException - { - fileIter.close(); - } - - @Override - public boolean hasNext() - { - return fileIter.hasNext(); - } - - @Override - public FileReadContext next() - { + return fileIter.map( + scanFileRow -> { Row row = fileIter.next(); return () -> row; } - }; + ); } @Override From 8d87de3b5dac8fe9d55b621d502db31637c9b67a Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 22:57:29 -0700 Subject: [PATCH 8/9] fix --- .../io/delta/kernel/parquet/ParquetBatchReader.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetBatchReader.java b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetBatchReader.java index 49d1720b7b3..aa2c9340491 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetBatchReader.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/parquet/ParquetBatchReader.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.util.Map; +import java.util.NoSuchElementException; import static java.util.Objects.requireNonNull; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.fs.FileStatus; @@ -69,6 +70,8 @@ public CloseableIterator read(String path, StructType schema) return new CloseableIterator() { + private boolean hasNotConsumedNextElement; + @Override public void close() throws IOException @@ -80,7 +83,11 @@ public void close() public boolean hasNext() { try { - return reader.nextKeyValue(); + if (hasNotConsumedNextElement) { + return true; + } + hasNotConsumedNextElement = reader.nextKeyValue(); + return hasNotConsumedNextElement; } catch (IOException | InterruptedException e) { throw new RuntimeException(e); @@ -90,8 +97,12 @@ public boolean hasNext() @Override public ColumnarBatch next() { + if (!hasNotConsumedNextElement) { + throw new NoSuchElementException(); + } int batchSize = 0; do { + hasNotConsumedNextElement = false; // hasNext reads to row to confirm there is a next element. batchReadSupport.moveToNextRow(); batchSize++; From 9dffd61302551a5c2ceea47efc1e67e23cb5b9fe Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Wed, 21 Jun 2023 23:15:23 -0700 Subject: [PATCH 9/9] Update kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java Co-authored-by: Allison Portis --- .../src/main/java/io/delta/kernel/DefaultKernelUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java index b98819e4e31..3c62d3ecb60 100644 --- a/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java +++ b/kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java @@ -83,6 +83,7 @@ public static Type findSubFieldType(GroupType groupType, StructField field) return null; } + // Note this only prunes top-level fields private static Type pruneSubfields(Type type, DataType deltaDatatype) { if (!(deltaDatatype instanceof StructType)) {