[SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector

## What changes were proposed in this pull request?

These dictionary-related APIs are specific to `WritableColumnVector` and should not live in `ColumnVector`, which will become a public API soon.
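For a picture of the resulting split, here is a condensed sketch assembled from the diff below (the two classes live in separate files; method bodies and the remaining typed getters are elided, so this is an illustration rather than the full classes):

```java
// Read-only surface that is about to become public: no dictionary state.
public abstract class ColumnVector implements AutoCloseable {
  public abstract int getInt(int rowId);
  public abstract int[] getInts(int rowId, int count);
  // ... other typed getters, getStruct/getArray/getMap, close(), etc.
}

// The writer-side subclass keeps all of the dictionary machinery.
public abstract class WritableColumnVector extends ColumnVector {
  protected Dictionary dictionary;              // if non-null, getXXX() decodes through it
  protected WritableColumnVector dictionaryIds; // reusable vector of dictionary ids

  public abstract int getDictId(int rowId);     // only valid when this vector holds dictionary ids
  public void setDictionary(Dictionary dictionary) { this.dictionary = dictionary; }
  public boolean hasDictionary() { return this.dictionary != null; }
  public WritableColumnVector getDictionaryIds() { return dictionaryIds; }
}
```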

## How was this patch tested?

Existing tests.

Author: Wenchen Fan <[email protected]>

Closes #19970 from cloud-fan/final.
cloud-fan committed Dec 14, 2017
1 parent c3dd2a2 commit 7d8e2ca
Showing 4 changed files with 73 additions and 88 deletions.
@@ -239,7 +239,7 @@ private void decodeDictionaryIds(
       int rowId,
       int num,
       WritableColumnVector column,
-      ColumnVector dictionaryIds) {
+      WritableColumnVector dictionaryIds) {
     switch (descriptor.getType()) {
       case INT32:
         if (column.dataType() == DataTypes.IntegerType ||
@@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) {
     return array;
   }
 
-  @Override
-  public int getDictId(int rowId) {
-    throw new UnsupportedOperationException();
-  }
-
   //
   // APIs dealing with Longs
   //
@@ -22,32 +22,29 @@
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
- * This class represents a column of values and provides the main APIs to access the data
- * values. It supports all the types and contains get APIs as well as their batched versions.
- * The batched versions are preferable whenever possible.
+ * This class represents in-memory values of a column and provides the main APIs to access the data.
+ * It supports all the types and contains get APIs as well as their batched versions. The batched
+ * versions are considered to be faster and preferable whenever possible.
  *
  * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these
- * columns have child columns. All of the data is stored in the child columns and the parent column
- * contains nullability, and in the case of Arrays, the lengths and offsets into the child column.
- * Lengths and offsets are encoded identically to INTs.
+ * columns have child columns. All of the data are stored in the child columns and the parent column
+ * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child
+ * column and are encoded identically to INTs.
  *
  * Maps are just a special case of a two field struct.
  *
  * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
- * in the current RowBatch.
- *
- * A ColumnVector should be considered immutable once originally created.
- *
- * ColumnVectors are intended to be reused.
+ * in the current batch.
  */
 public abstract class ColumnVector implements AutoCloseable {
 
   /**
    * Returns the data type of this column.
    */
   public final DataType dataType() { return type; }
 
   /**
    * Cleans up memory for this column. The column is not usable after this.
    * TODO: this should probably have ref-counted semantics.
    */
   public abstract void close();

@@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable {
    */
   public abstract int[] getInts(int rowId, int count);
 
-  /**
-   * Returns the dictionary Id for rowId.
-   * This should only be called when the ColumnVector is dictionaryIds.
-   * We have this separate method for dictionaryIds as per SPARK-16928.
-   */
-  public abstract int getDictId(int rowId);
-
   /**
    * Returns the value for rowId.
    */
@@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable {
   public abstract double[] getDoubles(int rowId, int count);
 
   /**
-   * Returns the length of the array at rowid.
+   * Returns the length of the array for rowId.
    */
   public abstract int getArrayLength(int rowId);
 
   /**
-   * Returns the offset of the array at rowid.
+   * Returns the offset of the array for rowId.
    */
   public abstract int getArrayOffset(int rowId);
 
   /**
-   * Returns a utility object to get structs.
+   * Returns the struct for rowId.
    */
   public final ColumnarRow getStruct(int rowId) {
     return new ColumnarRow(this, rowId);
   }
 
   /**
-   * Returns a utility object to get structs.
-   * provided to keep API compatibility with InternalRow for code generation
+   * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark
+   * codegen framework, the second parameter is totally ignored.
    */
   public final ColumnarRow getStruct(int rowId, int size) {
     return getStruct(rowId);
   }
 
   /**
-   * Returns the array at rowid.
+   * Returns the array for rowId.
    */
   public final ColumnarArray getArray(int rowId) {
     return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId));
   }
 
   /**
-   * Returns the value for rowId.
+   * Returns the map for rowId.
    */
   public MapData getMap(int ordinal) {
     throw new UnsupportedOperationException();
@@ -214,30 +204,6 @@ public MapData getMap(int ordinal) {
    */
   protected DataType type;
 
-  /**
-   * The Dictionary for this column.
-   *
-   * If it's not null, will be used to decode the value in getXXX().
-   */
-  protected Dictionary dictionary;
-
-  /**
-   * Reusable column for ids of dictionary.
-   */
-  protected ColumnVector dictionaryIds;
-
-  /**
-   * Returns true if this column has a dictionary.
-   */
-  public boolean hasDictionary() { return this.dictionary != null; }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  public ColumnVector getDictionaryIds() {
-    return dictionaryIds;
-  }
-
   /**
    * Sets up the common state and also handles creating the child columns if this is a nested
    * type.
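To make the read-side contract above concrete, a small hypothetical consumer of the public `ColumnVector` API (batch construction is elided; this assumes the usual `isNullAt(int)` accessor on `ColumnVector` and the standard `ColumnarArray` element accessors, which are not shown in this diff):

```java
// Sums an ARRAY<INT> column. rowId is the 0-based row id within the current
// batch; nullability lives in the parent vector, data in its child vector.
static long sumIntArrays(ColumnVector arrayCol, int numRows) {
  long total = 0;
  for (int rowId = 0; rowId < numRows; rowId++) {
    if (arrayCol.isNullAt(rowId)) continue;        // parent column tracks nulls
    ColumnarArray arr = arrayCol.getArray(rowId);  // offset/length read from the child column
    for (int i = 0; i < arr.numElements(); i++) {
      total += arr.getInt(i);
    }
  }
  return total;
}
```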
@@ -36,8 +36,10 @@
  * elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas),
  * the lengths are known up front.
  *
- * A ColumnVector should be considered immutable once originally created. In other words, it is not
- * valid to call put APIs after reads until reset() is called.
+ * A WritableColumnVector should be considered immutable once originally created. In other words,
+ * it is not valid to call put APIs after reads until reset() is called.
+ *
+ * WritableColumnVector are intended to be reused.
  */
 public abstract class WritableColumnVector extends ColumnVector {
 
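A quick sketch of that write-then-read contract (hypothetical driver code inside some method; `OnHeapColumnVector` is one of the concrete subclasses, not shown in this diff):

```java
// Fill, read, then reset() before the next round of writes, per the Javadoc above.
WritableColumnVector v = new OnHeapColumnVector(4, DataTypes.IntegerType);
for (int i = 0; i < 4; i++) {
  v.putInt(i, i * i);      // put APIs are only valid before any reads
}
int third = v.getInt(2);   // reads are fine once writing is done
v.reset();                 // makes the vector writable (and reusable) again
v.putInt(0, 42);
```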
@@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
   @Override
   public boolean anyNullsSet() { return anyNullsSet; }
 
+  /**
+   * Returns the dictionary Id for rowId.
+   *
+   * This should only be called when this `WritableColumnVector` represents dictionaryIds.
+   * We have this separate method for dictionaryIds as per SPARK-16928.
+   */
+  public abstract int getDictId(int rowId);
+
+  /**
+   * The Dictionary for this column.
+   *
+   * If it's not null, will be used to decode the value in getXXX().
+   */
+  protected Dictionary dictionary;
+
+  /**
+   * Reusable column for ids of dictionary.
+   */
+  protected WritableColumnVector dictionaryIds;
+
+  /**
+   * Returns true if this column has a dictionary.
+   */
+  public boolean hasDictionary() { return this.dictionary != null; }
+
+  /**
+   * Returns the underlying integer column for ids of dictionary.
+   */
+  public WritableColumnVector getDictionaryIds() {
+    return dictionaryIds;
+  }
+
+  /**
+   * Update the dictionary.
+   */
+  public void setDictionary(Dictionary dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Reserve a integer column for ids of dictionary.
+   */
+  public WritableColumnVector reserveDictionaryIds(int capacity) {
+    if (dictionaryIds == null) {
+      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
+    } else {
+      dictionaryIds.reset();
+      dictionaryIds.reserve(capacity);
+    }
+    return dictionaryIds;
+  }
+
   /**
    * Ensures that there is enough storage to store capacity elements. That is, the put() APIs
    * must work for all rowIds < capacity.
@@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) {
    */
   protected WritableColumnVector[] childColumns;
 
-  /**
-   * Update the dictionary.
-   */
-  public void setDictionary(Dictionary dictionary) {
-    this.dictionary = dictionary;
-  }
-
-  /**
-   * Reserve a integer column for ids of dictionary.
-   */
-  public WritableColumnVector reserveDictionaryIds(int capacity) {
-    WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds;
-    if (dictionaryIds == null) {
-      dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
-      this.dictionaryIds = dictionaryIds;
-    } else {
-      dictionaryIds.reset();
-      dictionaryIds.reserve(capacity);
-    }
-    return dictionaryIds;
-  }
-
-  /**
-   * Returns the underlying integer column for ids of dictionary.
-   */
-  @Override
-  public WritableColumnVector getDictionaryIds() {
-    return (WritableColumnVector) dictionaryIds;
-  }
-
   /**
    * Reserve a new column.
    */
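Finally, a sketch of how a reader is expected to drive the relocated dictionary APIs, modeled loosely on the Parquet path in `decodeDictionaryIds` above; `readNextId()` and `pageDictionary` are hypothetical stand-ins for the page decoder and its dictionary, and `decodeToInt` is one of `Dictionary`'s decode methods:

```java
// Decode a dictionary-encoded page: ids go into the reusable dictionaryIds
// vector, and setDictionary() makes subsequent getXXX() calls decode through it.
void readDictionaryEncodedPage(WritableColumnVector column, int rowId, int num) {
  WritableColumnVector ids = column.reserveDictionaryIds(rowId + num); // reuse or grow the id vector
  for (int i = 0; i < num; i++) {
    ids.putInt(rowId + i, readNextId());  // hypothetical id source
  }
  column.setDictionary(pageDictionary);   // hypothetical page dictionary
  // Now, e.g., column.getInt(r) is decoded as
  // pageDictionary.decodeToInt(column.getDictionaryIds().getDictId(r)).
}
```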
