Fix testIndexHasDuplicateData tests (elastic#49786)
testIndexHasDuplicateData tests were failing occasionally because of the
approximate calculation in BKDReader.estimatePointCount: for a leaf node,
the number of points it is assumed to contribute is (maxPointsInLeafNode + 1) / 2.
As DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024, the estimation could be really
off for the small indexes used in tests.

This rewrites the tests so that max points in leaf node is a small value,
which keeps the estimates under the tests' control.

Closes elastic#49703
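
For illustration only (not part of the commit), the arithmetic behind the flakiness: the per-leaf estimate depends on the configured leaf size rather than on how many points the segment actually holds. The class and method names below are made up for this sketch.

// Hypothetical sketch of the leaf-level approximation described above; not Lucene code.
public class LeafEstimateSketch {

    // BKDReader.estimatePointCount's per-leaf approximation, per the commit message:
    // roughly half of the leaf's configured capacity, whatever the leaf really holds.
    static long leafEstimate(int maxPointsInLeafNode) {
        return (maxPointsInLeafNode + 1) / 2;
    }

    public static void main(String[] args) {
        // With the default leaf size, a tiny test segment that fits in a single leaf
        // is still estimated at ~512 matching points, which can flip the
        // duplicate-data heuristic either way.
        System.out.println(leafEstimate(1024)); // 512
        // A small leaf size, as in the rewritten tests, keeps estimates close to
        // the actual point counts.
        System.out.println(leafEstimate(40));   // 20
    }
}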
mayya-sharipova committed Mar 19, 2020
1 parent 3b2b564 commit 2c77c0d
Showing 2 changed files with 74 additions and 41 deletions.
39 changes: 21 additions & 18 deletions server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -575,7 +575,7 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
* Returns true if more than 50% of data in the index have the same value
* The evaluation is approximation based on finding the median value and estimating its count
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
private static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long docsNoDupl = 0; // number of docs in segments with NO duplicate data that would benefit optimization
long docsDupl = 0; // number of docs in segments with duplicate data that would NOT benefit optimization
for (LeafReaderContext lrc : reader.leaves()) {
@@ -586,24 +586,8 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
continue;
}
assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values

int duplDocCount = docCount/2; // expected doc count of duplicate data
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, midValue);
long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
maxValue = midValue;
} else if ((countRight > countLeft) && (countRight > duplDocCount)) {
minValue = midValue + 1;
} else {
hasDuplicateData = false;
}
}
if (hasDuplicateData) {
if (pointsHaveDuplicateData(pointValues, duplDocCount)) {
docsDupl += docCount;
} else {
docsNoDupl += docCount;
@@ -612,6 +596,25 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
return (docsDupl > docsNoDupl);
}

static boolean pointsHaveDuplicateData(PointValues pointValues, int duplDocCount) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, midValue);
long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
maxValue = midValue;
} else if ((countRight > countLeft) && (countRight > duplDocCount)) {
minValue = midValue + 1;
} else {
hasDuplicateData = false;
}
}
return hasDuplicateData;
}


private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
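As an aside (not from the patch), the median-probing loop that the extracted pointsHaveDuplicateData method applies to a BKD tree can be sketched over a plain sorted long[] with exact range counts in place of the approximate estimatePointCount; every name in this sketch is illustrative.

import java.util.Arrays;

// Illustrative sketch only: the same "narrow down on the dominant half" loop as
// pointsHaveDuplicateData, but over a sorted array with exact counts rather than
// a BKD tree with estimated counts.
public class DuplicateMajoritySketch {

    // First index whose value is >= key (lower bound).
    private static int lowerBound(long[] sorted, long key) {
        int lo = 0, hi = sorted.length;
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (sorted[mid] < key) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        return lo;
    }

    // Number of elements v with lo <= v <= hi.
    private static long rangeCount(long[] sorted, long lo, long hi) {
        return lowerBound(sorted, hi + 1) - lowerBound(sorted, lo);
    }

    static boolean hasDuplicateMajority(long[] values) {
        long[] sorted = values.clone();
        Arrays.sort(sorted);
        long duplDocCount = sorted.length / 2;
        long minValue = sorted[0];
        long maxValue = sorted[sorted.length - 1];
        while (minValue < maxValue) {
            // halve each bound first to avoid overflow, as in the original
            long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);
            long countLeft = rangeCount(sorted, minValue, midValue);
            long countRight = rangeCount(sorted, midValue + 1, maxValue);
            if (countLeft >= countRight && countLeft > duplDocCount) {
                maxValue = midValue;     // dominant value lies in the left half
            } else if (countRight > countLeft && countRight > duplDocCount) {
                minValue = midValue + 1; // dominant value lies in the right half
            } else {
                return false;            // no half covers more than 50% of the docs
            }
        }
        return true;                     // a single value covers more than 50%
    }

    public static void main(String[] args) {
        System.out.println(hasDuplicateMajority(new long[] {7, 7, 7, 7, 1, 2, 3})); // true
        System.out.println(hasDuplicateMajority(new long[] {1, 2, 3, 4, 5, 6, 7})); // false
    }
}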
76 changes: 53 additions & 23 deletions server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -69,8 +69,13 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchShardTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -95,7 +100,7 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
import static org.elasticsearch.search.query.QueryPhase.pointsHaveDuplicateData;
import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
@@ -710,31 +715,56 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
dir.close();
}

@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/49703")
public void testIndexHasDuplicateData() throws IOException {
int docsCount = 7000;
int duplIndex = docsCount * 7 / 10;
int duplIndex2 = docsCount * 3 / 10;
int docsCount = 5000;
int maxPointsInLeafNode = 40;
float duplicateRatio = 0.7f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int docId = 0; docId < docsCount; docId++) {
Document doc = new Document();
long rndValue = randomLongBetween(-10000000L, 10000000L);
long value = (docId < duplIndex) ? duplicateValue : rndValue;
long value2 = (docId < duplIndex2) ? duplicateValue : rndValue;
doc.add(new LongPoint("duplicateField", value));
doc.add(new LongPoint("notDuplicateField", value2));
writer.addDocument(doc);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < docsCount; docId++) {
long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
assertTrue(pointsHaveDuplicateData(r, r.getDocCount()/2));
}
}
}

public void testIndexHasNoDuplicateData() throws IOException {
int docsCount = 5000;
int maxPointsInLeafNode = 40;
float duplicateRatio = 0.3f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < docsCount; docId++) {
long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
assertFalse(pointsHaveDuplicateData(r, r.getDocCount()/2));
}
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
reader.close();
dir.close();
assertTrue(hasDuplicateData);
assertFalse(hasDuplicateData2);
}

public void testMaxScoreQueryVisitor() {
