Fix testIndexHasDuplicateData tests (elastic#49786)
testIndexHasDuplicateData tests were failing occasionally because of the
approximate calculation in BKDReader.estimatePointCount: for a leaf node,
the number of points it is assumed to contribute is (maxPointsInLeafNode + 1) / 2.
As DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024, the estimation could be really
off for the small indexes used in tests.

This rewrites the tests so that max points in leaf node is a small value,
which keeps the estimates under the tests' control.

Closes elastic#49703
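
For illustration only (not part of the commit), the arithmetic behind the flakiness: the per-leaf estimate depends on the configured leaf size rather than on how many points the segment actually holds. The class and method names below are made up for this sketch.

// Hypothetical sketch of the leaf-level approximation described above; not Lucene code.
public class LeafEstimateSketch {

    // BKDReader.estimatePointCount's per-leaf approximation, per the commit message:
    // roughly half of the leaf's configured capacity, whatever the leaf really holds.
    static long leafEstimate(int maxPointsInLeafNode) {
        return (maxPointsInLeafNode + 1) / 2;
    }

    public static void main(String[] args) {
        // With the default leaf size, a tiny test segment that fits in a single leaf
        // is still estimated at ~512 matching points, which can flip the
        // duplicate-data heuristic either way.
        System.out.println(leafEstimate(1024)); // 512
        // A small leaf size, as in the rewritten tests, keeps estimates close to
        // the actual point counts.
        System.out.println(leafEstimate(40));   // 20
    }
}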
mayya-sharipova committed Mar 19, 2020
1 parent 3b2b564 commit 2c77c0d
Showing 2 changed files with 74 additions and 41 deletions.
39 changes: 21 additions & 18 deletions server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -575,7 +575,7 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
* Returns true if more than 50% of data in the index have the same value
* The evaluation is approximation based on finding the median value and estimating its count
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
private static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long docsNoDupl = 0; // number of docs in segments with NO duplicate data that would benefit optimization
long docsDupl = 0; // number of docs in segments with duplicate data that would NOT benefit optimization
for (LeafReaderContext lrc : reader.leaves()) {
@@ -586,24 +586,8 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
continue;
}
assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values

int duplDocCount = docCount/2; // expected doc count of duplicate data
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, midValue);
long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
maxValue = midValue;
} else if ((countRight > countLeft) && (countRight > duplDocCount)) {
minValue = midValue + 1;
} else {
hasDuplicateData = false;
}
}
if (hasDuplicateData) {
if (pointsHaveDuplicateData(pointValues, duplDocCount)) {
docsDupl += docCount;
} else {
docsNoDupl += docCount;
@@ -612,6 +596,25 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
return (docsDupl > docsNoDupl);
}

static boolean pointsHaveDuplicateData(PointValues pointValues, int duplDocCount) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, midValue);
long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
maxValue = midValue;
} else if ((countRight > countLeft) && (countRight > duplDocCount)) {
minValue = midValue + 1;
} else {
hasDuplicateData = false;
}
}
return hasDuplicateData;
}


private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
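As an aside (not from the patch), the median-probing loop that the extracted pointsHaveDuplicateData method applies to a BKD tree can be sketched over a plain sorted long[] with exact range counts in place of the approximate estimatePointCount; every name in this sketch is illustrative.

import java.util.Arrays;

// Illustrative sketch only: the same "narrow down on the dominant half" loop as
// pointsHaveDuplicateData, but over a sorted array with exact counts rather than
// a BKD tree with estimated counts.
public class DuplicateMajoritySketch {

    // First index whose value is >= key (lower bound).
    private static int lowerBound(long[] sorted, long key) {
        int lo = 0, hi = sorted.length;
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (sorted[mid] < key) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        return lo;
    }

    // Number of elements v with lo <= v <= hi.
    private static long rangeCount(long[] sorted, long lo, long hi) {
        return lowerBound(sorted, hi + 1) - lowerBound(sorted, lo);
    }

    static boolean hasDuplicateMajority(long[] values) {
        long[] sorted = values.clone();
        Arrays.sort(sorted);
        long duplDocCount = sorted.length / 2;
        long minValue = sorted[0];
        long maxValue = sorted[sorted.length - 1];
        while (minValue < maxValue) {
            // halve each bound first to avoid overflow, as in the original
            long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);
            long countLeft = rangeCount(sorted, minValue, midValue);
            long countRight = rangeCount(sorted, midValue + 1, maxValue);
            if (countLeft >= countRight && countLeft > duplDocCount) {
                maxValue = midValue;     // dominant value lies in the left half
            } else if (countRight > countLeft && countRight > duplDocCount) {
                minValue = midValue + 1; // dominant value lies in the right half
            } else {
                return false;            // no half covers more than 50% of the docs
            }
        }
        return true;                     // a single value covers more than 50%
    }

    public static void main(String[] args) {
        System.out.println(hasDuplicateMajority(new long[] {7, 7, 7, 7, 1, 2, 3})); // true
        System.out.println(hasDuplicateMajority(new long[] {1, 2, 3, 4, 5, 6, 7})); // false
    }
}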
76 changes: 53 additions & 23 deletions server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -69,8 +69,13 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchShardTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -95,7 +100,7 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
import static org.elasticsearch.search.query.QueryPhase.pointsHaveDuplicateData;
import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
@@ -710,31 +715,56 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
dir.close();
}

@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/49703")
public void testIndexHasDuplicateData() throws IOException {
int docsCount = 7000;
int duplIndex = docsCount * 7 / 10;
int duplIndex2 = docsCount * 3 / 10;
int docsCount = 5000;
int maxPointsInLeafNode = 40;
float duplicateRatio = 0.7f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int docId = 0; docId < docsCount; docId++) {
Document doc = new Document();
long rndValue = randomLongBetween(-10000000L, 10000000L);
long value = (docId < duplIndex) ? duplicateValue : rndValue;
long value2 = (docId < duplIndex2) ? duplicateValue : rndValue;
doc.add(new LongPoint("duplicateField", value));
doc.add(new LongPoint("notDuplicateField", value2));
writer.addDocument(doc);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < docsCount; docId++) {
long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
assertTrue(pointsHaveDuplicateData(r, r.getDocCount()/2));
}
}
}

public void testIndexHasNoDuplicateData() throws IOException {
int docsCount = 5000;
int maxPointsInLeafNode = 40;
float duplicateRatio = 0.3f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < docsCount; docId++) {
long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
assertFalse(pointsHaveDuplicateData(r, r.getDocCount()/2));
}
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
reader.close();
dir.close();
assertTrue(hasDuplicateData);
assertFalse(hasDuplicateData2);
}

public void testMaxScoreQueryVisitor() {
