diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index b7646969bcf3d..3ba180860c325 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -239,7 +239,7 @@ private void decodeDictionaryIds( int rowId, int num, WritableColumnVector column, - ColumnVector dictionaryIds) { + WritableColumnVector dictionaryIds) { switch (descriptor.getType()) { case INT32: if (column.dataType() == DataTypes.IntegerType || diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 1f1347ccd315e..e99201f6372fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) { return array; } - @Override - public int getDictId(int rowId) { - throw new UnsupportedOperationException(); - } - // // APIs dealing with Longs // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index e6b87519239dd..fd5caf3bf170b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -22,24 +22,22 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * This class represents a column of values and provides the main APIs to access the data - * values. It supports all the types and contains get APIs as well as their batched versions. - * The batched versions are preferable whenever possible. + * This class represents in-memory values of a column and provides the main APIs to access the data. + * It supports all the types and contains get APIs as well as their batched versions. The batched + * versions are considered to be faster and preferable whenever possible. * * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these - * columns have child columns. All of the data is stored in the child columns and the parent column - * contains nullability, and in the case of Arrays, the lengths and offsets into the child column. - * Lengths and offsets are encoded identically to INTs. + * columns have child columns. All of the data are stored in the child columns and the parent column + * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child + * column and are encoded identically to INTs. + * * Maps are just a special case of a two field struct. * * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values - * in the current RowBatch. - * - * A ColumnVector should be considered immutable once originally created. - * - * ColumnVectors are intended to be reused. + * in the current batch. */ public abstract class ColumnVector implements AutoCloseable { + /** * Returns the data type of this column. */ @@ -47,7 +45,6 @@ public abstract class ColumnVector implements AutoCloseable { /** * Cleans up memory for this column. The column is not usable after this. - * TODO: this should probably have ref-counted semantics. */ public abstract void close(); @@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable { */ public abstract int[] getInts(int rowId, int count); - /** - * Returns the dictionary Id for rowId. - * This should only be called when the ColumnVector is dictionaryIds. - * We have this separate method for dictionaryIds as per SPARK-16928. - */ - public abstract int getDictId(int rowId); - /** * Returns the value for rowId. */ @@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable { public abstract double[] getDoubles(int rowId, int count); /** - * Returns the length of the array at rowid. + * Returns the length of the array for rowId. */ public abstract int getArrayLength(int rowId); /** - * Returns the offset of the array at rowid. + * Returns the offset of the array for rowId. */ public abstract int getArrayOffset(int rowId); /** - * Returns a utility object to get structs. + * Returns the struct for rowId. */ public final ColumnarRow getStruct(int rowId) { return new ColumnarRow(this, rowId); } /** - * Returns a utility object to get structs. - * provided to keep API compatibility with InternalRow for code generation + * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark + * codegen framework, the second parameter is totally ignored. */ public final ColumnarRow getStruct(int rowId, int size) { return getStruct(rowId); } /** - * Returns the array at rowid. + * Returns the array for rowId. */ public final ColumnarArray getArray(int rowId) { return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); } /** - * Returns the value for rowId. + * Returns the map for rowId. */ public MapData getMap(int ordinal) { throw new UnsupportedOperationException(); @@ -214,30 +204,6 @@ public MapData getMap(int ordinal) { */ protected DataType type; - /** - * The Dictionary for this column. - * - * If it's not null, will be used to decode the value in getXXX(). - */ - protected Dictionary dictionary; - - /** - * Reusable column for ids of dictionary. - */ - protected ColumnVector dictionaryIds; - - /** - * Returns true if this column has a dictionary. - */ - public boolean hasDictionary() { return this.dictionary != null; } - - /** - * Returns the underlying integer column for ids of dictionary. - */ - public ColumnVector getDictionaryIds() { - return dictionaryIds; - } - /** * Sets up the common state and also handles creating the child columns if this is a nested * type. diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 7c053b579442c..63cf60818a855 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -36,8 +36,10 @@ * elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas), * the lengths are known up front. * - * A ColumnVector should be considered immutable once originally created. In other words, it is not - * valid to call put APIs after reads until reset() is called. + * A WritableColumnVector should be considered immutable once originally created. In other words, + * it is not valid to call put APIs after reads until reset() is called. + * + * WritableColumnVector are intended to be reused. */ public abstract class WritableColumnVector extends ColumnVector { @@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { @Override public boolean anyNullsSet() { return anyNullsSet; } + /** + * Returns the dictionary Id for rowId. + * + * This should only be called when this `WritableColumnVector` represents dictionaryIds. + * We have this separate method for dictionaryIds as per SPARK-16928. + */ + public abstract int getDictId(int rowId); + + /** + * The Dictionary for this column. + * + * If it's not null, will be used to decode the value in getXXX(). + */ + protected Dictionary dictionary; + + /** + * Reusable column for ids of dictionary. + */ + protected WritableColumnVector dictionaryIds; + + /** + * Returns true if this column has a dictionary. + */ + public boolean hasDictionary() { return this.dictionary != null; } + + /** + * Returns the underlying integer column for ids of dictionary. + */ + public WritableColumnVector getDictionaryIds() { + return dictionaryIds; + } + + /** + * Update the dictionary. + */ + public void setDictionary(Dictionary dictionary) { + this.dictionary = dictionary; + } + + /** + * Reserve a integer column for ids of dictionary. + */ + public WritableColumnVector reserveDictionaryIds(int capacity) { + if (dictionaryIds == null) { + dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType); + } else { + dictionaryIds.reset(); + dictionaryIds.reserve(capacity); + } + return dictionaryIds; + } + /** * Ensures that there is enough storage to store capacity elements. That is, the put() APIs * must work for all rowIds < capacity. @@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) { */ protected WritableColumnVector[] childColumns; - /** - * Update the dictionary. - */ - public void setDictionary(Dictionary dictionary) { - this.dictionary = dictionary; - } - - /** - * Reserve a integer column for ids of dictionary. - */ - public WritableColumnVector reserveDictionaryIds(int capacity) { - WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds; - if (dictionaryIds == null) { - dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType); - this.dictionaryIds = dictionaryIds; - } else { - dictionaryIds.reset(); - dictionaryIds.reserve(capacity); - } - return dictionaryIds; - } - - /** - * Returns the underlying integer column for ids of dictionary. - */ - @Override - public WritableColumnVector getDictionaryIds() { - return (WritableColumnVector) dictionaryIds; - } - /** * Reserve a new column. */