Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize index merge #1960

Merged
merged 1 commit into from
Nov 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions processing/src/main/java/io/druid/segment/IndexMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import io.druid.common.utils.JodaUtils;
import io.druid.common.utils.SerializerUtils;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.column.BitmapIndexSeeker;
import io.druid.segment.column.ColumnCapabilities;
import io.druid.segment.column.ColumnCapabilitiesImpl;
import io.druid.segment.column.ValueType;
Expand Down Expand Up @@ -216,8 +217,11 @@ public File mergeQueryableIndex(
ProgressIndicator progress
) throws IOException
{
return merge(
Lists.transform(
// We are materializing the list for performance reasons. Lists.transform
// only creates a "view" of the original list, meaning the function gets
// applied every time you access an element.
List<IndexableAdapter> indexAdapteres = Lists.newArrayList(
Iterables.transform(
indexes,
new Function<QueryableIndex, IndexableAdapter>()
{
Expand All @@ -227,7 +231,10 @@ public IndexableAdapter apply(final QueryableIndex input)
return new QueryableIndexIndexableAdapter(input);
}
}
),
)
);
return merge(
indexAdapteres,
metricAggs,
outDir,
null,
Expand Down Expand Up @@ -842,13 +849,17 @@ public Rowboat apply(@Nullable Rowboat input)
tree = new RTree(2, new LinearGutmanSplitStrategy(0, 50, bitmapFactory), bitmapFactory);
}

BitmapIndexSeeker[] bitmapIndexSeeker = new BitmapIndexSeeker[indexes.size()];
for (int j = 0; j < indexes.size(); j++) {
bitmapIndexSeeker[j] = indexes.get(j).getBitmapIndexSeeker(dimension);
}
for (String dimVal : IndexedIterable.create(dimVals)) {
progress.progress();
List<Iterable<Integer>> convertedInverteds = Lists.newArrayListWithCapacity(indexes.size());
for (int j = 0; j < indexes.size(); ++j) {
convertedInverteds.add(
new ConvertingIndexedInts(
indexes.get(j).getBitmapIndex(dimension, dimVal), rowNumConversions.get(j)
bitmapIndexSeeker[j].seek(dimVal), rowNumConversions.get(j)
)
);
}
Expand Down Expand Up @@ -998,6 +1009,7 @@ private static class DimValueConverter

private int currIndex;
private String lastVal = null;
private String currValue;

DimValueConverter(
Indexed<String> dimSet
Expand All @@ -1007,6 +1019,7 @@ private static class DimValueConverter
conversionBuf = ByteBuffer.allocateDirect(dimSet.size() * Ints.BYTES).asIntBuffer();

currIndex = 0;
currValue = null;
}

public void convert(String value, int index)
Expand All @@ -1020,7 +1033,9 @@ public void convert(String value, int index)
}
return;
}
String currValue = dimSet.get(currIndex);
if (currValue == null) {
currValue = dimSet.get(currIndex);
}

while (currValue == null) {
conversionBuf.position(conversionBuf.position() + 1);
Expand All @@ -1037,6 +1052,8 @@ public void convert(String value, int index)
++currIndex;
if (currIndex == dimSet.size()) {
lastVal = value;
} else {
currValue = dimSet.get(currIndex);
}
} else if (currValue.compareTo(value) < 0) {
throw new ISE(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package io.druid.segment;

import io.druid.segment.column.BitmapIndexSeeker;
import io.druid.segment.column.ColumnCapabilities;
import io.druid.segment.data.Indexed;
import io.druid.segment.data.IndexedInts;
Expand All @@ -41,6 +42,8 @@ public interface IndexableAdapter

IndexedInts getBitmapIndex(String dimension, String value);

BitmapIndexSeeker getBitmapIndexSeeker(String dimension);

String getMetricType(String metric);

ColumnCapabilities getCapabilities(String column);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,20 @@
import com.metamx.common.guava.CloseQuietly;
import com.metamx.common.logger.Logger;
import io.druid.segment.column.BitmapIndex;
import io.druid.segment.column.BitmapIndexSeeker;
import io.druid.segment.column.Column;
import io.druid.segment.column.ColumnCapabilities;
import io.druid.segment.column.ComplexColumn;
import io.druid.segment.column.DictionaryEncodedColumn;
import io.druid.segment.column.EmptyBitmapIndexSeeker;
import io.druid.segment.column.GenericColumn;
import io.druid.segment.column.IndexedFloatsGenericColumn;
import io.druid.segment.column.IndexedLongsGenericColumn;
import io.druid.segment.column.ValueType;
import io.druid.segment.data.ArrayBasedIndexedInts;
import io.druid.segment.data.BitmapCompressedIndexedInts;
import io.druid.segment.data.EmptyIndexedInts;
import io.druid.segment.data.GenericIndexed;
import io.druid.segment.data.Indexed;
import io.druid.segment.data.IndexedInts;
import io.druid.segment.data.IndexedIterable;
Expand Down Expand Up @@ -331,4 +334,76 @@ public ColumnCapabilities getCapabilities(String column)
{
return input.getColumn(column).getCapabilities();
}

/**
 * Creates a forward-only {@link BitmapIndexSeeker} over the bitmap index of {@code dimension}.
 *
 * <p>An {@link EmptyBitmapIndexSeeker} is returned when the column does not exist or exposes no
 * bitmap index. Otherwise the returned seeker walks the dimension's value lookup in step with the
 * caller: each call to {@code seek} must pass a value strictly greater than the previous one.
 *
 * @param dimension name of the dimension whose bitmap index should be traversed
 * @return a seeker yielding the bitmap rows for each value it is asked for, in order
 */
@Override
public BitmapIndexSeeker getBitmapIndexSeeker(String dimension)
{
  final Column column = input.getColumn(dimension);

  if (column == null) {
    return new EmptyBitmapIndexSeeker();
  }

  final BitmapIndex bitmaps = column.getBitmapIndex();
  if (bitmaps == null) {
    return new EmptyBitmapIndexSeeker();
  }

  final Indexed<String> dimSet = getDimValueLookup(dimension);

  // BitmapIndexSeeker is where the main performance boost of the optimized merge comes from.
  //
  // Previously, building the inverted index worked like a merge sort over multiple bitmap indexes:
  // it iterated over all the sorted values and looked each one up in every bitmap index with a
  // binary search, which involves disk IO and is really slow. Suppose we have N small segments
  // (100 in the original author's test), each with M rows (50000 in that test). In a high
  // cardinality scenario there are almost N * M unique values, so the original author estimated
  // the cost at O(N * M * M * LOG(M)).
  //
  // Two properties were not exploited by that approach:
  // 1. The dimension values are always visited sequentially.
  // 2. When cardinality is high enough, a single dimension value is present in only one index.
  //
  // The seeker therefore only moves forward and can never seek back. Together with
  // "getDimValueLookup", every dimension value is translated to its ID exactly once, simply by
  // incrementing an integer: the cached value is refreshed, and the ID advanced, only after the
  // previous value has been hit. The original author estimated the new cost at O(N * M * LOG(M)).
  return new BitmapIndexSeeker()
  {
    // Position of the next not-yet-matched entry in dimSet.
    private int currIndex = 0;
    // Cached dimSet.get(currIndex); may legitimately be null when the lookup holds a null value.
    private String currVal = null;
    // The value that matched the final entry of dimSet; meaningful only once exhausted is true.
    private String lastVal = null;
    // Explicit end-of-lookup marker. A null check on lastVal is not sufficient because the value
    // that matches the final entry can itself be null, which previously left the seeker
    // "not exhausted" and made the next seek read dimSet past its last index.
    private boolean exhausted = false;

    @Override
    public IndexedInts seek(String value)
    {
      if (dimSet == null || dimSet.size() == 0) {
        return new EmptyIndexedInts();
      }
      if (exhausted) {
        // Every entry has been consumed; only strictly larger values may follow.
        // NOTE(review): relies on STRING_STRATEGY.compare accepting null arguments, as the
        // comparison against currVal below already does - confirm.
        if (GenericIndexed.STRING_STRATEGY.compare(value, lastVal) <= 0) {
          throw new ISE("Value[%s] is less than the last value[%s] I have, cannot be.",
                        value, lastVal);
        }
        return new EmptyIndexedInts();
      }
      if (currVal == null) {
        currVal = dimSet.get(currIndex);
      }
      int compareResult = GenericIndexed.STRING_STRATEGY.compare(currVal, value);
      if (compareResult == 0) {
        IndexedInts ret = new BitmapCompressedIndexedInts(bitmaps.getBitmap(currIndex));
        ++currIndex;
        if (currIndex == dimSet.size()) {
          lastVal = value;
          exhausted = true;
        } else {
          currVal = dimSet.get(currIndex);
        }
        return ret;
      } else if (compareResult < 0) {
        // The caller jumped past a value this index holds, so its rows would be lost.
        throw new ISE("Skipped currValue[%s], currIndex[%,d]; incoming value[%s]",
                      currVal, currIndex, value);
      } else {
        // The requested value sorts before the current entry: this index does not contain it.
        return new EmptyIndexedInts();
      }
    }
  };
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import io.druid.segment.column.BitmapIndexSeeker;
import io.druid.segment.column.ColumnCapabilities;
import io.druid.segment.data.Indexed;
import io.druid.segment.data.IndexedInts;
Expand Down Expand Up @@ -92,4 +93,10 @@ public ColumnCapabilities getCapabilities(String column)
{
return baseAdapter.getCapabilities(column);
}

/**
 * Hands back the seeker produced by {@code baseAdapter} for the given dimension, unchanged.
 *
 * @param dimension name of the dimension to obtain a seeker for
 * @return whatever {@code baseAdapter.getBitmapIndexSeeker(dimension)} returns
 */
@Override
public BitmapIndexSeeker getBitmapIndexSeeker(String dimension)
{
  final BitmapIndexSeeker delegateSeeker = baseAdapter.getBitmapIndexSeeker(dimension);
  return delegateSeeker;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Druid - a distributed column store.
* Copyright 2012 - 2015 Metamarkets Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.druid.segment.column;

import io.druid.segment.data.IndexedInts;

/**
 * Looks up bitmap index entries by dimension value.
 *
 * <p>Only in-order access is supported: callers are expected to pass values to
 * {@link #seek(String)} in sequence and never go back to an earlier one.
 */
public interface BitmapIndexSeeker
{
  /**
   * Moves to {@code value} and returns the index entries associated with it.
   *
   * @param value the dimension value to look up
   * @return the {@link IndexedInts} for {@code value}
   */
  IndexedInts seek(String value);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Druid - a distributed column store.
* Copyright 2012 - 2015 Metamarkets Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.druid.segment.column;

import io.druid.segment.data.EmptyIndexedInts;
import io.druid.segment.data.IndexedInts;

/**
 * A {@link BitmapIndexSeeker} that has nothing to look up: every call to {@link #seek(String)}
 * answers with a newly created {@link EmptyIndexedInts}, whatever value is requested.
 */
public class EmptyBitmapIndexSeeker implements BitmapIndexSeeker
{
  @Override
  public IndexedInts seek(String value)
  {
    // The requested value is deliberately ignored.
    final IndexedInts noRows = new EmptyIndexedInts();
    return noRows;
  }
}
Loading