Expose duplicate removal in the completion suggester (#26496)
This change exposes the duplicate-removal support added in Lucene to the completion suggester
through a new option, `skip_duplicates` (defaults to false).
It also adapts the custom suggest collector to deduplicate suggestions when multiple contexts match the input.

Closes #23364
jimczi authored Sep 7, 2017
1 parent abe83c4 commit d68d8c9
Showing 14 changed files with 394 additions and 96 deletions.
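
A minimal usage sketch (not part of this commit) of the new option through the Java API: it wires the skipDuplicates(boolean) setter added to CompletionSuggestionBuilder below into a search request. The field name suggest_field, the suggestion name song-suggest, and the prefix text are illustrative placeholders.

import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;

public class SkipDuplicatesUsageSketch {
    public static SearchSourceBuilder suggestRequest() {
        // Completion suggestion on a hypothetical "suggest_field" completion mapping.
        CompletionSuggestionBuilder completion = new CompletionSuggestionBuilder("suggest_field")
                .prefix("nir")
                .skipDuplicates(true) // option added by this commit, defaults to false
                .size(5);             // number of (de-duplicated) suggestions to return
        // Attach the suggestion to the suggest section of a search request.
        return new SearchSourceBuilder()
                .suggest(new SuggestBuilder().addSuggestion("song-suggest", completion));
    }
}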
org/elasticsearch/search/suggest/completion/CompletionSuggester.java
@@ -18,17 +18,16 @@
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.CompletionQuery;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.CompletionFieldMapper;
import org.elasticsearch.search.suggest.Suggest;
@@ -53,12 +52,14 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
final CompletionSuggestionContext suggestionContext, final IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
if (suggestionContext.getFieldType() != null) {
final CompletionFieldMapper.CompletionFieldType fieldType = suggestionContext.getFieldType();
CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize());
CompletionSuggestion completionSuggestion =
new CompletionSuggestion(name, suggestionContext.getSize(), suggestionContext.isSkipDuplicates());
spare.copyUTF8Bytes(suggestionContext.getText());
CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry(
new Text(spare.toString()), 0, spare.length());
completionSuggestion.addTerm(completionSuggestEntry);
TopSuggestDocsCollector collector = new TopDocumentsCollector(suggestionContext.getSize());
TopSuggestDocsCollector collector =
new TopDocumentsCollector(suggestionContext.getSize(), suggestionContext.isSkipDuplicates());
suggest(searcher, suggestionContext.toQuery(), collector);
int numResult = 0;
for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
@@ -97,8 +98,21 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
}
}

// TODO: this should be refactored and moved to lucene
// see https://issues.apache.org/jira/browse/LUCENE-6880
/**
* TODO: this should be refactored and moved to Lucene, see https://issues.apache.org/jira/browse/LUCENE-6880
*
* Custom collector that returns top documents from the completion suggester.
* When suggestions are augmented with context values, this collector groups suggestions coming from the same document
* but matching different contexts. Each document is counted as one entry and the provided size is the expected number
* of documents to return (not the number of suggestions).
* This collector is also able to filter out duplicate suggestions coming from different documents.
* When different contexts match the same suggestion form, only the best one (sorted by weight) is kept.
* In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
* only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
* This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
* that match the input. If more than N*num_contexts suggestions are duplicated with different contexts, this collector
* will not be able to return more than one suggestion even when N is greater than 1.
**/
private static final class TopDocumentsCollector extends TopSuggestDocsCollector {

/**
@@ -150,93 +164,53 @@ public List<CharSequence> getContexts() {
}
}

private static final class SuggestDocPriorityQueue extends PriorityQueue<SuggestDoc> {
private final Map<Integer, SuggestDoc> docsMap;

SuggestDocPriorityQueue(int maxSize) {
super(maxSize);
}

@Override
protected boolean lessThan(SuggestDoc a, SuggestDoc b) {
if (a.score == b.score) {
int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
if (cmp == 0) {
// prefer smaller doc id, in case of a tie
return a.doc > b.doc;
} else {
return cmp > 0;
}
}
return a.score < b.score;
}

public SuggestDoc[] getResults() {
int size = size();
SuggestDoc[] res = new SuggestDoc[size];
for (int i = size - 1; i >= 0; i--) {
res[i] = pop();
}
return res;
}
}

private final int num;
private final SuggestDocPriorityQueue pq;
private final Map<Integer, SuggestDoc> scoreDocMap;

// TODO: expose dup removal

TopDocumentsCollector(int num) {
super(1, false); // TODO hack, we don't use the underlying pq, so we allocate a size of 1
this.num = num;
this.scoreDocMap = new LinkedHashMap<>(num);
this.pq = new SuggestDocPriorityQueue(num);
}

@Override
public int getCountToCollect() {
// This is only needed because we initialize
// the base class with 1 instead of the actual num
return num;
}


@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
super.doSetNextReader(context);
updateResults();
}

private void updateResults() {
for (SuggestDoc suggestDoc : scoreDocMap.values()) {
if (pq.insertWithOverflow(suggestDoc) == suggestDoc) {
break;
}
}
scoreDocMap.clear();
TopDocumentsCollector(int num, boolean skipDuplicates) {
super(Math.max(1, num), skipDuplicates);
this.docsMap = new LinkedHashMap<>(num);
}

@Override
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
if (scoreDocMap.containsKey(docID)) {
SuggestDoc suggestDoc = scoreDocMap.get(docID);
suggestDoc.add(key, context, score);
} else if (scoreDocMap.size() <= num) {
scoreDocMap.put(docID, new SuggestDoc(docBase + docID, key, context, score));
int globalDoc = docID + docBase;
if (docsMap.containsKey(globalDoc)) {
docsMap.get(globalDoc).add(key, context, score);
} else {
throw new CollectionTerminatedException();
docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
super.collect(docID, key, context, score);
}
}

@Override
public TopSuggestDocs get() throws IOException {
updateResults(); // to empty the last set of collected suggest docs
TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs = pq.getResults();
if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
} else {
TopSuggestDocs entries = super.get();
if (entries.scoreDocs.length == 0) {
return TopSuggestDocs.EMPTY;
}
// The parent class returns suggestions, not documents, and dedups only the surface form (without contexts).
// The following code groups suggestions matching different contexts by document id and dedups the surface form + contexts
// if needed (skip_duplicates).
int size = entries.scoreDocs.length;
final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList<>(size);
final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null;
for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
final SuggestDoc suggestDoc;
if (docsMap != null) {
suggestDoc = docsMap.get(suggestEntry.doc);
} else {
suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score);
}
if (doSkipDuplicates()) {
if (seenSurfaceForms.contains(suggestDoc.key)) {
continue;
}
seenSurfaceForms.add(suggestDoc.key);
}
suggestDocs.add(suggestDoc);
}
return new TopSuggestDocs((int) entries.totalHits,
suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore());
}
}
}
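
The Javadoc on TopDocumentsCollector above describes two behaviours: hits from the same document (matched through different contexts) are grouped into a single entry, and, when skip_duplicates is set, a surface form already returned for a better-scored document is dropped. The standalone sketch below mirrors that post-processing with plain Java collections instead of the Lucene/Elasticsearch types used above; the Hit class, the method name, and the assumption that hits arrive sorted by descending score are illustrative.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class CollectorDedupSketch {

    /** Simplified stand-in for a single (document, surface form, context, score) hit. */
    static final class Hit {
        final int doc;
        final String key;
        final String context;
        final float score;

        Hit(int doc, String key, String context, float score) {
            this.doc = doc;
            this.key = key;
            this.context = context;
            this.score = score;
        }
    }

    /**
     * Groups hits by document id so that each document counts as one result, merging the
     * contexts of hits that point at the same document. When skipDuplicates is true, a
     * document whose surface form was already emitted by a higher-scored document is dropped.
     * Hits are assumed to be sorted by descending score, which is the order in which the
     * collector above sees them.
     */
    static Map<Integer, List<Hit>> groupAndDedup(List<Hit> sortedHits, boolean skipDuplicates) {
        Map<Integer, List<Hit>> byDoc = new LinkedHashMap<>();
        Set<String> seenSurfaceForms = skipDuplicates ? new HashSet<>() : null;
        for (Hit hit : sortedHits) {
            List<Hit> existing = byDoc.get(hit.doc);
            if (existing != null) {
                // Same document matched through another context: merge instead of adding a new entry.
                existing.add(hit);
                continue;
            }
            if (skipDuplicates) {
                if (seenSurfaceForms.contains(hit.key)) {
                    continue; // surface form already returned for a better-scored document
                }
                seenSurfaceForms.add(hit.key);
            }
            List<Hit> hits = new ArrayList<>();
            hits.add(hit);
            byDoc.put(hit.doc, hits);
        }
        return byDoc;
    }
}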
org/elasticsearch/search/suggest/completion/CompletionSuggestion.java
@@ -18,8 +18,10 @@
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.suggest.Lookup;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -68,11 +70,38 @@ public final class CompletionSuggestion extends Suggest.Suggestion<CompletionSug

public static final int TYPE = 4;

private boolean skipDuplicates;

public CompletionSuggestion() {
}

public CompletionSuggestion(String name, int size) {
/**
* Constructor
* @param name The name for the suggestions
* @param size The number of suggestions to return
* @param skipDuplicates Whether duplicate suggestions should be filtered out
*/
public CompletionSuggestion(String name, int size, boolean skipDuplicates) {
super(name, size);
this.skipDuplicates = skipDuplicates;
}

@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
// TODO should be backported to 6.1.0
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
skipDuplicates = in.readBoolean();
}
}

@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
// TODO should be backported to 6.1.0
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeBoolean(skipDuplicates);
}
}

/**
@@ -95,7 +124,7 @@ public boolean hasScoreDocs() {
}

public static CompletionSuggestion fromXContent(XContentParser parser, String name) throws IOException {
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1);
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1, false);
parseEntries(parser, suggestion, CompletionSuggestion.Entry::fromXContent);
return suggestion;
}
@@ -146,9 +175,19 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe
// the global top <code>size</code> entries are collected from the shard results
// using a priority queue
OptionPriorityQueue priorityQueue = new OptionPriorityQueue(leader.getSize(), COMPARATOR);
// Skip duplicate suggestions (based on the surface form) if skip_duplicates is activated
final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null;
for (Suggest.Suggestion<Entry> suggestion : toReduce) {
assert suggestion.getName().equals(name) : "name should be identical across all suggestions";
for (Entry.Option option : ((CompletionSuggestion) suggestion).getOptions()) {
if (leader.skipDuplicates) {
assert ((CompletionSuggestion) suggestion).skipDuplicates;
String text = option.getText().string();
if (seenSurfaceForms.contains(text)) {
continue;
}
seenSurfaceForms.add(text);
}
if (option == priorityQueue.insertWithOverflow(option)) {
// if the current option has overflown from pq,
// we can assume all of the successive options
@@ -157,7 +196,7 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe
}
}
}
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize());
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize(), leader.skipDuplicates);
final Entry entry = new Entry(leaderEntry.getText(), leaderEntry.getOffset(), leaderEntry.getLength());
Collections.addAll(entry.getOptions(), priorityQueue.get());
suggestion.addTerm(entry);
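
The reduceTo method above merges per-shard suggestions through an internal OptionPriorityQueue and, when skip_duplicates is set, discards options whose surface form has already been accepted. Below is a simplified, self-contained sketch of that merge using a JDK PriorityQueue as a bounded top-N; the Option class and the method signature are illustrative assumptions, and the early termination on overflow used by the real code is omitted.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Set;

public class ReduceDedupSketch {

    /** Illustrative stand-in for a suggestion option coming back from one shard. */
    static final class Option {
        final String text;
        final float score;

        Option(String text, float score) {
            this.text = text;
            this.score = score;
        }
    }

    /**
     * Merges options from several shards into the global top `size` by score, optionally
     * skipping options whose surface form (text) was already accepted. As in the reduce
     * step above, a surface form counts as seen even if its option later falls out of the
     * top-N. Assumes size >= 1.
     */
    static List<Option> reduce(List<List<Option>> shardResults, int size, boolean skipDuplicates) {
        // Min-heap on score so the weakest of the current top-N is evicted first.
        PriorityQueue<Option> topN = new PriorityQueue<>(size, Comparator.comparingDouble((Option o) -> o.score));
        Set<String> seenSurfaceForms = skipDuplicates ? new HashSet<>() : null;
        for (List<Option> shard : shardResults) {
            for (Option option : shard) {
                if (skipDuplicates && !seenSurfaceForms.add(option.text)) {
                    continue; // duplicate surface form from another shard
                }
                topN.offer(option);
                if (topN.size() > size) {
                    topN.poll(); // drop the lowest-scored option
                }
            }
        }
        List<Option> result = new ArrayList<>(topN);
        result.sort(Comparator.comparingDouble((Option o) -> o.score).reversed());
        return result;
    }
}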
org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java
@@ -19,6 +19,7 @@
package org.elasticsearch.search.suggest.completion;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
@@ -57,6 +58,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
private static final XContentType CONTEXT_BYTES_XCONTENT_TYPE = XContentType.JSON;
static final String SUGGESTION_NAME = "completion";
static final ParseField CONTEXTS_FIELD = new ParseField("contexts", "context");
static final ParseField SKIP_DUPLICATES_FIELD = new ParseField("skip_duplicates");

/**
* {
@@ -94,11 +96,13 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
v.contextBytes = builder.bytes();
p.skipChildren();
}, CONTEXTS_FIELD, ObjectParser.ValueType.OBJECT); // context is deprecated
PARSER.declareBoolean(CompletionSuggestionBuilder::skipDuplicates, SKIP_DUPLICATES_FIELD);
}

protected FuzzyOptions fuzzyOptions;
protected RegexOptions regexOptions;
protected BytesReference contextBytes = null;
protected boolean skipDuplicates = false;

public CompletionSuggestionBuilder(String field) {
super(field);
@@ -113,6 +117,7 @@ private CompletionSuggestionBuilder(String fieldname, CompletionSuggestionBuilde
fuzzyOptions = in.fuzzyOptions;
regexOptions = in.regexOptions;
contextBytes = in.contextBytes;
skipDuplicates = in.skipDuplicates;
}

/**
@@ -123,13 +128,21 @@ public CompletionSuggestionBuilder(StreamInput in) throws IOException {
fuzzyOptions = in.readOptionalWriteable(FuzzyOptions::new);
regexOptions = in.readOptionalWriteable(RegexOptions::new);
contextBytes = in.readOptionalBytesReference();
// TODO should be backported to 6.1.0
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
skipDuplicates = in.readBoolean();
}
}

@Override
public void doWriteTo(StreamOutput out) throws IOException {
out.writeOptionalWriteable(fuzzyOptions);
out.writeOptionalWriteable(regexOptions);
out.writeOptionalBytesReference(contextBytes);
// TODO should be backported to 6.1.0
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeBoolean(skipDuplicates);
}
}

/**
@@ -210,6 +223,21 @@ private CompletionSuggestionBuilder contexts(XContentBuilder contextBuilder) {
return this;
}

/**
* Returns whether duplicate suggestions should be filtered out.
*/
public boolean skipDuplicates() {
return skipDuplicates;
}

/**
* Sets whether duplicate suggestions should be filtered out. Defaults to <tt>false</tt>.
*/
public CompletionSuggestionBuilder skipDuplicates(boolean skipDuplicates) {
this.skipDuplicates = skipDuplicates;
return this;
}

private static class InnerBuilder extends CompletionSuggestionBuilder {
private String field;

@@ -231,6 +259,9 @@ protected XContentBuilder innerToXContent(XContentBuilder builder, Params params
if (regexOptions != null) {
regexOptions.toXContent(builder, params);
}
if (skipDuplicates) {
builder.field(SKIP_DUPLICATES_FIELD.getPreferredName(), skipDuplicates);
}
if (contextBytes != null) {
builder.rawField(CONTEXTS_FIELD.getPreferredName(), contextBytes);
}
@@ -255,6 +286,7 @@ public SuggestionContext build(QueryShardContext context) throws IOException {
// copy over common settings to each suggestion builder
final MapperService mapperService = context.getMapperService();
populateCommonFields(mapperService, suggestionContext);
suggestionContext.setSkipDuplicates(skipDuplicates);
suggestionContext.setFuzzyOptions(fuzzyOptions);
suggestionContext.setRegexOptions(regexOptions);
MappedFieldType mappedFieldType = mapperService.fullName(suggestionContext.getField());
@@ -302,13 +334,14 @@ public String getWriteableName() {

@Override
protected boolean doEquals(CompletionSuggestionBuilder other) {
return Objects.equals(fuzzyOptions, other.fuzzyOptions) &&
return skipDuplicates == other.skipDuplicates &&
Objects.equals(fuzzyOptions, other.fuzzyOptions) &&
Objects.equals(regexOptions, other.regexOptions) &&
Objects.equals(contextBytes, other.contextBytes);
}

@Override
protected int doHashCode() {
return Objects.hash(fuzzyOptions, regexOptions, contextBytes);
return Objects.hash(fuzzyOptions, regexOptions, contextBytes, skipDuplicates);
}
}
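
For reference, the skip_duplicates flag that CompletionSuggestionBuilder above serializes under SKIP_DUPLICATES_FIELD (and parses back through PARSER.declareBoolean) surfaces inside the completion object of a suggest request body. The constant below sketches such a body; the suggestion name, field, and prefix are placeholders, not values taken from this commit.

public class SkipDuplicatesRequestBodySketch {

    /**
     * Illustrative _search request body exercising the skip_duplicates option handled by
     * CompletionSuggestionBuilder above. Names and values are placeholders.
     */
    public static final String SUGGEST_REQUEST_BODY =
        "{\n" +
        "  \"suggest\": {\n" +
        "    \"song-suggest\": {\n" +
        "      \"prefix\": \"nir\",\n" +
        "      \"completion\": {\n" +
        "        \"field\": \"suggest_field\",\n" +
        "        \"skip_duplicates\": true\n" +
        "      }\n" +
        "    }\n" +
        "  }\n" +
        "}\n";
}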
(Diffs for the remaining changed files are not shown.)
