From 54c6ac709960fb59862aa2b16d7758cb8d1ba62f Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 4 Sep 2017 17:13:40 +0200 Subject: [PATCH 1/6] Expose duplicate removal in the completion suggester This change exposes the duplicate removal option added in Lucene for the completion suggester with a new option called `skip_duplicates` (defaults to false). This commit also adapts the custom suggest collector to handle deduplication when multiple contexts match the input. Closes #23364 --- .../completion/CompletionSuggester.java | 138 ++++++++---------- .../completion/CompletionSuggestion.java | 39 ++++- .../CompletionSuggestionBuilder.java | 31 +++- .../CompletionSuggestionContext.java | 9 ++ .../search/SearchPhaseControllerTests.java | 6 +- .../suggest/CompletionSuggestSearchIT.java | 32 ++++ .../ContextCompletionSuggestSearchIT.java | 44 ++++++ .../search/suggest/SuggestTests.java | 4 +- .../search/suggest/SuggestionTests.java | 6 +- .../CompletionSuggesterBuilderTests.java | 7 +- .../completion/CompletionSuggestionTests.java | 3 +- .../suggesters/completion-suggest.asciidoc | 28 ++++ .../test/suggest/20_completion.yml | 38 +++++ .../rest-api-spec/test/suggest/30_context.yml | 74 +++++++++- 14 files changed, 363 insertions(+), 96 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 0b127b2eeef7d..254dab0d692fb 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -18,17 +18,16 @@ */ package org.elasticsearch.search.suggest.completion; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.CollectionTerminatedException; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.document.CompletionQuery; import org.apache.lucene.search.suggest.document.TopSuggestDocs; import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector; import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.PriorityQueue; import org.elasticsearch.common.text.Text; import org.elasticsearch.index.mapper.CompletionFieldMapper; import org.elasticsearch.search.suggest.Suggest; @@ -53,12 +52,15 @@ protected Suggest.Suggestion getContexts() { } } - private static final class SuggestDocPriorityQueue extends PriorityQueue { + private final Map docsMap; - SuggestDocPriorityQueue(int maxSize) { - super(maxSize); - } - - @Override - protected boolean lessThan(SuggestDoc a, SuggestDoc b) { - if (a.score == b.score) { - int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); - if (cmp == 0) { - // prefer smaller doc id, in case of a tie - return a.doc > b.doc; - } else { - return cmp > 0; - } - } - return a.score < b.score; - } - - public SuggestDoc[] getResults() { - int size = size(); - SuggestDoc[] res = new SuggestDoc[size]; - for (int i = size - 1; i >= 0; i--) { - res[i] = pop(); - } - return res; - } - } - - private final int num; - private final SuggestDocPriorityQueue pq; - private final Map scoreDocMap; - - // TODO: expose dup removal - - TopDocumentsCollector(int num) { - super(1, false); // TODO hack, we don't use the underlying pq, so we allocate a size of 1 - this.num = num; - this.scoreDocMap = new LinkedHashMap<>(num); - this.pq = new SuggestDocPriorityQueue(num); - } - - @Override - public int getCountToCollect() { - // This is only needed because we initialize - // the base class with 1 instead of the actual num - return num; - } - - - @Override - protected void doSetNextReader(LeafReaderContext context) throws IOException { - super.doSetNextReader(context); - updateResults(); - } - - private void updateResults() { - for (SuggestDoc suggestDoc : scoreDocMap.values()) { - if (pq.insertWithOverflow(suggestDoc) == suggestDoc) { - break; - } - } - scoreDocMap.clear(); + TopDocumentsCollector(int num, boolean hasContexts, boolean skipDuplicates) { + super(Math.max(1, num), skipDuplicates); + this.docsMap = hasContexts ? new LinkedHashMap<>(num) : null; } @Override public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { - if (scoreDocMap.containsKey(docID)) { - SuggestDoc suggestDoc = scoreDocMap.get(docID); - suggestDoc.add(key, context, score); - } else if (scoreDocMap.size() <= num) { - scoreDocMap.put(docID, new SuggestDoc(docBase + docID, key, context, score)); + if (docsMap == null) { + super.collect(docID, key, context, score); } else { - throw new CollectionTerminatedException(); + int globalDoc = docID + docBase; + if (docsMap.containsKey(globalDoc)) { + docsMap.get(globalDoc).add(key, context, score); + } else { + docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score)); + super.collect(docID, key, context, score); + } } } @Override public TopSuggestDocs get() throws IOException { - updateResults(); // to empty the last set of collected suggest docs - TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs = pq.getResults(); - if (suggestScoreDocs.length > 0) { - return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score); - } else { + TopSuggestDocs entries = super.get(); + if (entries.scoreDocs.length == 0) { return TopSuggestDocs.EMPTY; } + // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts). + // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts + // if needed (skip_duplicates). + int size = entries.scoreDocs.length; + final List suggestDocs = new ArrayList(size); + final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null; + for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) { + final SuggestDoc suggestDoc; + if (docsMap != null) { + suggestDoc = docsMap.get(suggestEntry.doc); + } else { + suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score); + } + if (doSkipDuplicates()) { + if (seenSurfaceForms.contains(suggestDoc.key)) { + continue; + } + seenSurfaceForms.add(suggestDoc.key); + } + suggestDocs.add(suggestDoc); + } + return new TopSuggestDocs((int) entries.totalHits, + suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore()); } } } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java index 229b77aad2850..917687504ef2f 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java @@ -18,8 +18,10 @@ */ package org.elasticsearch.search.suggest.completion; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.suggest.Lookup; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -68,11 +70,32 @@ public final class CompletionSuggestion extends Suggest.Suggestion> toRe // the global top size entries are collected from the shard results // using a priority queue OptionPriorityQueue priorityQueue = new OptionPriorityQueue(leader.getSize(), COMPARATOR); + // Dedup duplicate suggestions (based on the surface form) if skip duplicates is activated + final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null; for (Suggest.Suggestion suggestion : toReduce) { assert suggestion.getName().equals(name) : "name should be identical across all suggestions"; for (Entry.Option option : ((CompletionSuggestion) suggestion).getOptions()) { + if (leader.skipDuplicates) { + assert ((CompletionSuggestion) suggestion).skipDuplicates; + String text = option.getText().string(); + if (seenSurfaceForms.contains(text)) { + continue; + } + seenSurfaceForms.add(text); + } if (option == priorityQueue.insertWithOverflow(option)) { // if the current option has overflown from pq, // we can assume all of the successive options @@ -157,7 +190,7 @@ public static CompletionSuggestion reduceTo(List> toRe } } } - final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize()); + final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize(), leader.skipDuplicates); final Entry entry = new Entry(leaderEntry.getText(), leaderEntry.getOffset(), leaderEntry.getLength()); Collections.addAll(entry.getOptions(), priorityQueue.get()); suggestion.addTerm(entry); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java index 462aa8e271bad..44284623770f6 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.suggest.completion; import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.StreamInput; @@ -57,6 +58,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder> queryContexts = Collections.emptyMap(); CompletionFieldMapper.CompletionFieldType getFieldType() { @@ -62,6 +63,10 @@ void setQueryContexts(Map> que this.queryContexts = queryContexts; } + void setSkipDuplicates(boolean skipDuplicates) { + this.skipDuplicates = skipDuplicates; + } + public FuzzyOptions getFuzzyOptions() { return fuzzyOptions; } @@ -74,6 +79,10 @@ public Map> getQueryContexts() return queryContexts; } + public boolean isSkipDuplicates() { + return skipDuplicates; + } + CompletionQuery toQuery() { CompletionFieldMapper.CompletionFieldType fieldType = getFieldType(); final CompletionQuery query; diff --git a/core/src/test/java/org/elasticsearch/action/search/SearchPhaseControllerTests.java b/core/src/test/java/org/elasticsearch/action/search/SearchPhaseControllerTests.java index e6d1e20147b90..7501a7a90be70 100644 --- a/core/src/test/java/org/elasticsearch/action/search/SearchPhaseControllerTests.java +++ b/core/src/test/java/org/elasticsearch/action/search/SearchPhaseControllerTests.java @@ -72,7 +72,7 @@ public void setup() { public void testSort() throws Exception { List suggestions = new ArrayList<>(); for (int i = 0; i < randomIntBetween(1, 5); i++) { - suggestions.add(new CompletionSuggestion(randomAlphaOfLength(randomIntBetween(1, 5)), randomIntBetween(1, 20))); + suggestions.add(new CompletionSuggestion(randomAlphaOfLength(randomIntBetween(1, 5)), randomIntBetween(1, 20), false)); } int nShards = randomIntBetween(1, 20); int queryResultSize = randomBoolean() ? 0 : randomIntBetween(1, nShards * 2); @@ -139,7 +139,7 @@ public void testMerge() throws IOException { for (int i = 0; i < randomIntBetween(1, 5); i++) { int size = randomIntBetween(1, 20); maxSuggestSize += size; - suggestions.add(new CompletionSuggestion(randomAlphaOfLength(randomIntBetween(1, 5)), size)); + suggestions.add(new CompletionSuggestion(randomAlphaOfLength(randomIntBetween(1, 5)), size, false)); } int nShards = randomIntBetween(1, 20); int queryResultSize = randomBoolean() ? 0 : randomIntBetween(1, nShards * 2); @@ -202,7 +202,7 @@ private AtomicArray generateQueryResults(int nShards, List shardSuggestion = new ArrayList<>(); for (CompletionSuggestion completionSuggestion : suggestions) { CompletionSuggestion suggestion = new CompletionSuggestion( - completionSuggestion.getName(), completionSuggestion.getSize()); + completionSuggestion.getName(), completionSuggestion.getSize(), false); final CompletionSuggestion.Entry completionEntry = new CompletionSuggestion.Entry(new Text(""), 0, 5); suggestion.addTerm(completionEntry); int optionSize = randomIntBetween(1, suggestion.getSize()); diff --git a/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java index 5bd2bad31d134..fa9f9c27645dc 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java @@ -858,6 +858,38 @@ public void testThatIndexingInvalidFieldsInCompletionFieldResultsInException() t } } + public void testSkipDuplicates() throws Exception { + final CompletionMappingBuilder mapping = new CompletionMappingBuilder(); + createIndexAndMapping(mapping); + int numDocs = randomIntBetween(10, 100); + int numUnique = randomIntBetween(1, numDocs); + List indexRequestBuilders = new ArrayList<>(); + for (int i = 1; i <= numDocs; i++) { + int id = i % numUnique; + indexRequestBuilders.add(client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(jsonBuilder() + .startObject() + .startObject(FIELD) + .field("input", "suggestion" + id) + .field("weight", id) + .endObject() + .endObject() + )); + } + String[] expected = new String[numUnique]; + int sugg = numUnique - 1; + for (int i = 0; i < numUnique; i++) { + expected[i] = "suggestion" + sugg--; + } + indexRandom(true, indexRequestBuilders); + CompletionSuggestionBuilder completionSuggestionBuilder = + SuggestBuilders.completionSuggestion(FIELD).prefix("sugg").skipDuplicates(true).size(numUnique); + + SearchResponse searchResponse = client().prepareSearch(INDEX) + .suggest(new SuggestBuilder().addSuggestion("suggestions", completionSuggestionBuilder)).execute().actionGet(); + assertSuggestions(searchResponse, true, "suggestions", expected); + } + public void assertSuggestions(String suggestionName, SuggestionBuilder suggestBuilder, String... suggestions) { SearchResponse searchResponse = client().prepareSearch(INDEX).suggest(new SuggestBuilder().addSuggestion(suggestionName, suggestBuilder)).execute().actionGet(); assertSuggestions(searchResponse, suggestionName, suggestions); diff --git a/core/src/test/java/org/elasticsearch/search/suggest/ContextCompletionSuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/ContextCompletionSuggestSearchIT.java index a9741c3170008..17afd584ebb42 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/ContextCompletionSuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/ContextCompletionSuggestSearchIT.java @@ -639,6 +639,50 @@ public void testGeoField() throws Exception { assertEquals("Hotel Amsterdam in Berlin", searchResponse.getSuggest().getSuggestion(suggestionName).iterator().next().getOptions().iterator().next().getText().string()); } + public void testSkipDuplicatesWithContexts() throws Exception { + LinkedHashMap map = new LinkedHashMap<>(); + map.put("type", ContextBuilder.category("type").field("type").build()); + map.put("cat", ContextBuilder.category("cat").field("cat").build()); + final CompletionMappingBuilder mapping = new CompletionMappingBuilder().context(map); + createIndexAndMapping(mapping); + int numDocs = randomIntBetween(10, 100); + int numUnique = randomIntBetween(1, numDocs); + List indexRequestBuilders = new ArrayList<>(); + for (int i = 0; i < numDocs; i++) { + int id = i % numUnique; + XContentBuilder source = jsonBuilder() + .startObject() + .startObject(FIELD) + .field("input", "suggestion" + id) + .field("weight", id) + .endObject() + .field("cat", "cat" + id % 2) + .field("type", "type" + id) + .endObject(); + indexRequestBuilders.add(client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(source)); + } + String[] expected = new String[numUnique]; + for (int i = 0; i < numUnique; i++) { + expected[i] = "suggestion" + (numUnique-1-i); + } + indexRandom(true, indexRequestBuilders); + CompletionSuggestionBuilder completionSuggestionBuilder = + SuggestBuilders.completionSuggestion(FIELD).prefix("sugg").skipDuplicates(true).size(numUnique); + + assertSuggestions("suggestions", completionSuggestionBuilder, expected); + + Map> contextMap = new HashMap<>(); + contextMap.put("cat", Arrays.asList(CategoryQueryContext.builder().setCategory("cat0").build())); + completionSuggestionBuilder = + SuggestBuilders.completionSuggestion(FIELD).prefix("sugg").contexts(contextMap).skipDuplicates(true).size(numUnique); + + String[] expectedModulo = Arrays.stream(expected) + .filter((s) -> Integer.parseInt(s.substring("suggestion".length())) % 2 == 0) + .toArray(String[]::new); + assertSuggestions("suggestions", completionSuggestionBuilder, expectedModulo); + } + public void assertSuggestions(String suggestionName, SuggestionBuilder suggestBuilder, String... suggestions) { SearchResponse searchResponse = client().prepareSearch(INDEX).suggest( new SuggestBuilder().addSuggestion(suggestionName, suggestBuilder) diff --git a/core/src/test/java/org/elasticsearch/search/suggest/SuggestTests.java b/core/src/test/java/org/elasticsearch/search/suggest/SuggestTests.java index f1a630fab3742..d53cbfdab6e80 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/SuggestTests.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/SuggestTests.java @@ -139,7 +139,7 @@ public void testToXContent() throws IOException { public void testFilter() throws Exception { List>> suggestions; - CompletionSuggestion completionSuggestion = new CompletionSuggestion(randomAlphaOfLength(10), 2); + CompletionSuggestion completionSuggestion = new CompletionSuggestion(randomAlphaOfLength(10), 2, false); PhraseSuggestion phraseSuggestion = new PhraseSuggestion(randomAlphaOfLength(10), 2); TermSuggestion termSuggestion = new TermSuggestion(randomAlphaOfLength(10), 2, SortBy.SCORE); suggestions = Arrays.asList(completionSuggestion, phraseSuggestion, termSuggestion); @@ -160,7 +160,7 @@ public void testSuggestionOrdering() throws Exception { suggestions = new ArrayList<>(); int n = randomIntBetween(2, 5); for (int i = 0; i < n; i++) { - suggestions.add(new CompletionSuggestion(randomAlphaOfLength(10), randomIntBetween(3, 5))); + suggestions.add(new CompletionSuggestion(randomAlphaOfLength(10), randomIntBetween(3, 5), false)); } Collections.shuffle(suggestions, random()); Suggest suggest = new Suggest(suggestions); diff --git a/core/src/test/java/org/elasticsearch/search/suggest/SuggestionTests.java b/core/src/test/java/org/elasticsearch/search/suggest/SuggestionTests.java index 3c56597299dd3..70c4396ce8867 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/SuggestionTests.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/SuggestionTests.java @@ -79,7 +79,7 @@ public static Suggestion> createTestItem(Class suggestion = new PhraseSuggestion(name, size); entrySupplier = () -> SuggestionEntryTests.createTestItem(PhraseSuggestion.Entry.class); } else if (type == CompletionSuggestion.class) { - suggestion = new CompletionSuggestion(name, size); + suggestion = new CompletionSuggestion(name, size, randomBoolean()); entrySupplier = () -> SuggestionEntryTests.createTestItem(CompletionSuggestion.Entry.class); } else { throw new UnsupportedOperationException("type not supported [" + type + "]"); @@ -249,7 +249,7 @@ public void testToXContent() throws IOException { CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(1, new Text("someText"), 1.3f, contexts); CompletionSuggestion.Entry entry = new CompletionSuggestion.Entry(new Text("entryText"), 42, 313); entry.addOption(option); - CompletionSuggestion suggestion = new CompletionSuggestion("suggestionName", 5); + CompletionSuggestion suggestion = new CompletionSuggestion("suggestionName", 5, randomBoolean()); suggestion.addTerm(entry); BytesReference xContent = toXContent(suggestion, XContentType.JSON, params, randomBoolean()); assertEquals( @@ -265,4 +265,4 @@ public void testToXContent() throws IOException { + "}]}", xContent.utf8ToString()); } } -} \ No newline at end of file +} diff --git a/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggesterBuilderTests.java b/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggesterBuilderTests.java index d8eb885823b8e..862916890e1bb 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggesterBuilderTests.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggesterBuilderTests.java @@ -114,6 +114,7 @@ public static CompletionSuggestionBuilder randomCompletionSuggestionBuilder() { contextMap.put(geoQueryContextName, contexts); } testBuilder.contexts(contextMap); + testBuilder.skipDuplicates(randomBoolean()); return testBuilder; } @@ -128,7 +129,7 @@ protected String[] shuffleProtectedFields() { @Override protected void mutateSpecificParameters(CompletionSuggestionBuilder builder) throws IOException { - switch (randomIntBetween(0, 4)) { + switch (randomIntBetween(0, 5)) { case 0: int nCatContext = randomIntBetween(1, 5); List contexts = new ArrayList<>(nCatContext); @@ -154,6 +155,9 @@ protected void mutateSpecificParameters(CompletionSuggestionBuilder builder) thr case 4: builder.regex(randomAlphaOfLength(10), RegexOptionsTests.randomRegexOptions()); break; + case 5: + builder.skipDuplicates(!builder.skipDuplicates); + break; default: throw new IllegalStateException("should not through"); } @@ -182,5 +186,6 @@ protected void assertSuggestionContext(CompletionSuggestionBuilder builder, Sugg assertEquals(parsedContextBytes.get(contextName), queryContexts.get(contextName)); } assertEquals(builder.regexOptions, completionSuggestionCtx.getRegexOptions()); + assertEquals(builder.skipDuplicates, completionSuggestionCtx.isSkipDuplicates()); } } diff --git a/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionTests.java b/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionTests.java index 4b0e60a1d00db..2a5a89bdde332 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionTests.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionTests.java @@ -24,6 +24,7 @@ import org.elasticsearch.test.ESTestCase; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -38,7 +39,7 @@ public void testToReduce() throws Exception { String name = randomAlphaOfLength(10); int size = randomIntBetween(3, 5); for (int i = 0; i < nShards; i++) { - CompletionSuggestion suggestion = new CompletionSuggestion(name, size); + CompletionSuggestion suggestion = new CompletionSuggestion(name, size, false); suggestion.addTerm(new CompletionSuggestion.Entry(new Text(""), 0, 0)); shardSuggestions.add(suggestion); } diff --git a/docs/reference/search/suggesters/completion-suggest.asciidoc b/docs/reference/search/suggesters/completion-suggest.asciidoc index 1a42dedf47c74..566a659279f60 100644 --- a/docs/reference/search/suggesters/completion-suggest.asciidoc +++ b/docs/reference/search/suggesters/completion-suggest.asciidoc @@ -277,6 +277,7 @@ The basic completion suggester query supports the following parameters: `field`:: The name of the field on which to run the query (required). `size`:: The number of suggestions to return (defaults to `5`). +`skip_duplicates`:: Whether duplicate suggestions should be filtered out (defaults to `false`). NOTE: The completion suggester considers all documents in the index. See <> for an explanation of how to query a subset of @@ -291,6 +292,33 @@ index completions into a single shard index. In case of high heap usage due to shard size, it is still recommended to break index into multiple shards instead of optimizing for completion performance. +[[skip_duplicates]] +==== Skip duplicate suggestions + +Queries can return duplicate suggestions coming from different documents. +It is possible to modify this behavior by setting `skip_duplicates` to true. +When set, this option filters out documents with duplicate suggestions from the result. + +[source,js] +-------------------------------------------------- +POST music/_search?pretty +{ + "suggest": { + "song-suggest" : { + "prefix" : "nor", + "completion" : { + "field" : "suggest", + "skip_duplicates": true + } + } + } +} +-------------------------------------------------- +// CONSOLE + +WARNING: when set to true this option can slow down search because more suggestions +need to be visited to find the top N. + [[fuzzy]] ==== Fuzzy queries diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml index 860c43bbcbf0e..d0e3e422a1745 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml @@ -291,3 +291,41 @@ setup: - match: { suggest.result.0.options.1._type: "test" } - match: { suggest.result.0.options.1._source.title: "title_bar" } - match: { suggest.result.0.options.1._source.count: 4 } + +--- +"Skip duplicates should work": + - skip: + version: " - 6.99.99" + reason: skip_duplicates was added in 7.0 (TODO should be backported to 6.1) + - do: + index: + index: test + type: test + id: 1 + body: + suggest_1: "bar" + + - do: + index: + index: test + type: test + id: 2 + body: + suggest_1: "bar" + + - do: + indices.refresh: {} + + - do: + search: + body: + suggest: + result: + text: "b" + completion: + field: suggest_1 + skip_duplicates: true + + - length: { suggest.result: 1 } + - length: { suggest.result.0.options: 1 } + - match: { suggest.result.0.options.0.text: "bar" } diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml index 778094ec90baf..50470ecb9a2fd 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml @@ -276,4 +276,76 @@ setup: - length: { suggest.result: 1 } - length: { suggest.result.0.options: 1 } - - match: { suggest.result.0.options.0.text: "Marriot in Berlin" } + - match: { suggest.result.0.options.0.text: "Marriot in Berlin" } + +--- +"Skip duplicates with contexts should work": +- skip: + version: " - 6.99.99" + reason: skip_duplicates was added in 7.0 (TODO should be backported to 6.1) + + - do: + index: + index: test + type: test + id: 1 + body: + suggest_context: + input: "foo" + contexts: + color: "red" + + - do: + index: + index: test + type: test + id: 1 + body: + suggest_context: + input: "foo" + contexts: + color: "red" + + - do: + index: + index: test + type: test + id: 2 + body: + suggest_context: + input: "foo" + contexts: + color: "blue" + + - do: + indices.refresh: {} + + - do: + search: + body: + suggest: + result: + text: "foo" + completion: + field: suggest_context + skip_duplicates: true + contexts: + color: "red" + + - length: { suggest.result: 1 } + - length: { suggest.result.0.options: 1 } + - match: { suggest.result.0.options.0.text: "foo" } + + - do: + search: + body: + suggest: + result: + text: "foo" + completion: + skip_duplicates: true + field: suggest_context + + - length: { suggest.result: 1 } + - length: { suggest.result.0.options: 1 } + - match: { suggest.result.0.options.0.text: "foo" } From 450804f00803f97cc7d57a644368c514c5ca8dcb Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 4 Sep 2017 17:24:36 +0200 Subject: [PATCH 2/6] skip bwc rest tests for now --- .../main/resources/rest-api-spec/test/suggest/20_completion.yml | 1 + .../main/resources/rest-api-spec/test/suggest/30_context.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml index d0e3e422a1745..2b2f5934185f5 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/20_completion.yml @@ -297,6 +297,7 @@ setup: - skip: version: " - 6.99.99" reason: skip_duplicates was added in 7.0 (TODO should be backported to 6.1) + - do: index: index: test diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml index 50470ecb9a2fd..ee2801df67d99 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/suggest/30_context.yml @@ -280,7 +280,7 @@ setup: --- "Skip duplicates with contexts should work": -- skip: + - skip: version: " - 6.99.99" reason: skip_duplicates was added in 7.0 (TODO should be backported to 6.1) From f62508531a42b365de0a3b5c9dfa1def67b7a082 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 5 Sep 2017 13:37:49 +0200 Subject: [PATCH 3/6] after review --- .../search/suggest/completion/CompletionSuggester.java | 7 ++++--- .../search/suggest/completion/CompletionSuggestion.java | 6 ++++++ .../suggest/completion/CompletionSuggestionBuilder.java | 3 +++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 254dab0d692fb..0c78b95e70b49 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -109,9 +109,10 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu * This collector is also able to filter duplicate suggestion coming from different documents. * When different contexts match the same suggestion form only the best one (sorted by weight) is kept. * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done - * only on the top N*num_contexts suggestions per segment. This means that skip_duplicates will visit at most N*num_contexts suggestions - * per segment to find unique suggestions that match the input. If more than N*num_contexts suggestions are duplicated with different - * contexts this collector will be able to return only one suggestion even when N is greater than 1. + * only on the top N*num_contexts (where N is the number of requested suggestions) suggestions per segment. + * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions + * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector + * will not be able to return more than one suggestion even when N is greater than 1. **/ private static final class TopDocumentsCollector extends TopSuggestDocsCollector { diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java index 917687504ef2f..9d540b57ef16e 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java @@ -75,6 +75,12 @@ public final class CompletionSuggestion extends Suggest.Suggestion Date: Tue, 5 Sep 2017 13:39:09 +0200 Subject: [PATCH 4/6] javadoc --- .../search/suggest/completion/CompletionSuggester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 0c78b95e70b49..018d2e4525750 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -109,7 +109,7 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu * This collector is also able to filter duplicate suggestion coming from different documents. * When different contexts match the same suggestion form only the best one (sorted by weight) is kept. * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done - * only on the top N*num_contexts (where N is the number of requested suggestions) suggestions per segment. + * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment. * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector * will not be able to return more than one suggestion even when N is greater than 1. From 114e078d84c5856c2475b11b6d9a068624d46735 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 5 Sep 2017 13:51:44 +0200 Subject: [PATCH 5/6] fix grouping per doc in the completion suggester when a document matches the input with different suggestions within the same context --- .../completion/CompletionSuggester.java | 21 +++++++----------- .../suggest/CompletionSuggestSearchIT.java | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 018d2e4525750..38b3d4b420216 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -59,8 +59,7 @@ protected Suggest.Suggestion getContexts() { private final Map docsMap; - TopDocumentsCollector(int num, boolean hasContexts, boolean skipDuplicates) { + TopDocumentsCollector(int num, boolean skipDuplicates) { super(Math.max(1, num), skipDuplicates); - this.docsMap = hasContexts ? new LinkedHashMap<>(num) : null; + this.docsMap = new LinkedHashMap<>(num); } @Override public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { - if (docsMap == null) { - super.collect(docID, key, context, score); + int globalDoc = docID + docBase; + if (docsMap.containsKey(globalDoc)) { + docsMap.get(globalDoc).add(key, context, score); } else { - int globalDoc = docID + docBase; - if (docsMap.containsKey(globalDoc)) { - docsMap.get(globalDoc).add(key, context, score); - } else { - docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score)); - super.collect(docID, key, context, score); - } + docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score)); + super.collect(docID, key, context, score); } } diff --git a/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java index fa9f9c27645dc..f0f47524d26ec 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java @@ -1140,6 +1140,28 @@ public void testIndexingUnrelatedNullValue() throws Exception { } } + public void testMultiDocSuggestions() throws Exception { + final CompletionMappingBuilder mapping = new CompletionMappingBuilder(); + createIndexAndMapping(mapping); + int numDocs = 10; + List indexRequestBuilders = new ArrayList<>(); + for (int i = 1; i <= numDocs; i++) { + indexRequestBuilders.add(client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(jsonBuilder() + .startObject() + .startObject(FIELD) + .array("input", "suggestion" + i, "suggestions" + i, "suggester" + i) + .field("weight", i) + .endObject() + .endObject() + )); + } + indexRandom(true, indexRequestBuilders); + CompletionSuggestionBuilder prefix = SuggestBuilders.completionSuggestion(FIELD).prefix("sugg"); + assertSuggestions("foo", prefix, "suggester10", "suggester9", "suggester8", "suggester7", "suggester6"); + } + + public static boolean isReservedChar(char c) { switch (c) { case '\u001F': From a53e904bb82dcda76ce9d057a75296db2126da92 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 5 Sep 2017 14:01:10 +0200 Subject: [PATCH 6/6] javadoc --- .../suggest/completion/CompletionSuggestionBuilder.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java index 9f79e09ae1417..468a7d21fc5e3 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java @@ -224,12 +224,15 @@ private CompletionSuggestionBuilder contexts(XContentBuilder contextBuilder) { } /** - * Returns whether duplicate suggestions should be filtered out (defaults to false). + * Returns whether duplicate suggestions should be filtered out. */ public boolean skipDuplicates() { return skipDuplicates; } + /** + * Should duplicates be filtered or not. Defaults to false. + */ public CompletionSuggestionBuilder skipDuplicates(boolean skipDuplicates) { this.skipDuplicates = skipDuplicates; return this;