-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-5917: [Java] Redesign the dictionary encoder
The current dictionary encoder implementation (org.apache.arrow.vector.dictionary.DictionaryEncoder) has heavy performance overhead, which prevents it from being useful in practice:
* There are repeated conversions between Java objects and bytes (e.g. vector.getObject).
* There are unnecessary memory copies (the vector data must be copied to the hash table).
* The hash table cannot be reused for encoding multiple vectors (other data structures and results cannot be reused either).
* The output vector should not be created or managed by the encoder (just like in the out-of-place sorter).
* The hash table requires that the hashCode and equals methods be implemented appropriately, but this is not guaranteed.

We plan to implement a new encoder in the algorithm module, and gradually deprecate the current one. Closes #4994 from liyafan82/fly_0712_encode and squashes the following commits: 8b699a8 <liyafan82> Redesign the dictionary encoder Authored-by: liyafan82 <[email protected]> Signed-off-by: Micah Kornfield <[email protected]>
- Loading branch information
1 parent
40eddfe
commit 149efd9
Showing
2 changed files
with
463 additions
and
0 deletions.
There are no files selected for viewing
105 changes: 105 additions & 0 deletions
105
...lgorithm/src/main/java/org/apache/arrow/algorithm/dictionary/SearchDictionaryEncoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.arrow.algorithm.dictionary; | ||
|
||
import org.apache.arrow.algorithm.search.VectorSearcher; | ||
import org.apache.arrow.algorithm.sort.VectorValueComparator; | ||
import org.apache.arrow.vector.BaseIntVector; | ||
import org.apache.arrow.vector.ValueVector; | ||
|
||
/** | ||
* Dictionary encoder based on searching. | ||
* @param <E> encoded vector type. | ||
* @param <D> decoded vector type, which is also the dictionary type. | ||
*/ | ||
public class SearchDictionaryEncoder<E extends BaseIntVector, D extends ValueVector> { | ||
|
||
/** | ||
* The dictionary for encoding/decoding. | ||
* It must be sorted. | ||
*/ | ||
private final D dictionary; | ||
|
||
/** | ||
* The criteria by which the dictionary is sorted. | ||
*/ | ||
private final VectorValueComparator<D> comparator; | ||
|
||
/** | ||
* A flag indicating if null should be encoded. | ||
*/ | ||
private final boolean encodeNull; | ||
|
||
/** | ||
* Constructs a dictionary encoder. | ||
* @param dictionary the dictionary. It must be in sorted order. | ||
* @param comparator the criteria for sorting. | ||
*/ | ||
public SearchDictionaryEncoder(D dictionary, VectorValueComparator<D> comparator) { | ||
this(dictionary, comparator, false); | ||
} | ||
|
||
/** | ||
* Constructs a dictionary encoder. | ||
* @param dictionary the dictionary. It must be in sorted order. | ||
* @param comparator the criteria for sorting. | ||
* @param encodeNull a flag indicating if null should be encoded. | ||
* It determines the behaviors for processing null values in the input during encoding/decoding. | ||
* <li> | ||
* For encoding, when a null is encountered in the input, | ||
* 1) If the flag is set to true, the encoder searches for the value in the dictionary, | ||
* and outputs the index in the dictionary. | ||
* 2) If the flag is set to false, the encoder simply produces a null in the output. | ||
* </li> | ||
* <li> | ||
* For decoding, when a null is encountered in the input, | ||
* 1) If the flag is set to true, the decoder should never expect a null in the input. | ||
* 2) If set to false, the decoder simply produces a null in the output. | ||
* </li> | ||
*/ | ||
public SearchDictionaryEncoder(D dictionary, VectorValueComparator<D> comparator, boolean encodeNull) { | ||
this.dictionary = dictionary; | ||
this.comparator = comparator; | ||
this.encodeNull = encodeNull; | ||
} | ||
|
||
/** | ||
* Encodes an input vector by binary search. | ||
* So the algorithm takes O(n * log(m)) time, where n is the length of the input vector, | ||
* and m is the length of the dictionary. | ||
* @param input the input vector. | ||
* @param output the output vector. Note that it must be in a fresh state. At least, | ||
* all its validity bits should be clear. | ||
*/ | ||
public void encode(D input, E output) { | ||
for (int i = 0; i < input.getValueCount(); i++) { | ||
if (!encodeNull && input.isNull(i)) { | ||
// for this case, we should simply output a null in the output. | ||
// by assuming the output vector is fresh, we do nothing here. | ||
continue; | ||
} | ||
|
||
int index = VectorSearcher.binarySearch(dictionary, comparator, input, i); | ||
if (index == -1) { | ||
throw new IllegalArgumentException("The data element is not found in the dictionary: " + i); | ||
} | ||
output.setWithPossibleTruncate(i, index); | ||
} | ||
output.setValueCount(input.getValueCount()); | ||
} | ||
} |
Oops, something went wrong.