-
Notifications
You must be signed in to change notification settings - Fork 25.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Expose duplicate removal in the completion suggester #26496
Changes from 2 commits
54c6ac7
450804f
f625085
beedb08
114e078
a53e904
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,17 +18,16 @@ | |
*/ | ||
package org.elasticsearch.search.suggest.completion; | ||
|
||
import org.apache.lucene.analysis.CharArraySet; | ||
import org.apache.lucene.index.LeafReaderContext; | ||
import org.apache.lucene.search.BulkScorer; | ||
import org.apache.lucene.search.CollectionTerminatedException; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.search.Weight; | ||
import org.apache.lucene.search.suggest.Lookup; | ||
import org.apache.lucene.search.suggest.document.CompletionQuery; | ||
import org.apache.lucene.search.suggest.document.TopSuggestDocs; | ||
import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector; | ||
import org.apache.lucene.util.CharsRefBuilder; | ||
import org.apache.lucene.util.PriorityQueue; | ||
import org.elasticsearch.common.text.Text; | ||
import org.elasticsearch.index.mapper.CompletionFieldMapper; | ||
import org.elasticsearch.search.suggest.Suggest; | ||
|
@@ -53,12 +52,15 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges | |
final CompletionSuggestionContext suggestionContext, final IndexSearcher searcher, CharsRefBuilder spare) throws IOException { | ||
if (suggestionContext.getFieldType() != null) { | ||
final CompletionFieldMapper.CompletionFieldType fieldType = suggestionContext.getFieldType(); | ||
CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize()); | ||
CompletionSuggestion completionSuggestion = | ||
new CompletionSuggestion(name, suggestionContext.getSize(), suggestionContext.isSkipDuplicates()); | ||
spare.copyUTF8Bytes(suggestionContext.getText()); | ||
CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry( | ||
new Text(spare.toString()), 0, spare.length()); | ||
completionSuggestion.addTerm(completionSuggestEntry); | ||
TopSuggestDocsCollector collector = new TopDocumentsCollector(suggestionContext.getSize()); | ||
TopSuggestDocsCollector collector = | ||
new TopDocumentsCollector(suggestionContext.getSize(), suggestionContext.getFieldType().hasContextMappings(), | ||
suggestionContext.isSkipDuplicates()); | ||
suggest(searcher, suggestionContext.toQuery(), collector); | ||
int numResult = 0; | ||
for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) { | ||
|
@@ -97,8 +99,20 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu | |
} | ||
} | ||
|
||
// TODO: this should be refactored and moved to lucene | ||
// see https://issues.apache.org/jira/browse/LUCENE-6880 | ||
/** | ||
* TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880 | ||
* | ||
* Custom collector that returns top documents from the completion suggester. | ||
* When suggestions are augmented with contexts values this collector groups suggestions coming from the same document | ||
* but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number | ||
* of documents that should be returned (not the number of suggestions). | ||
* This collector is also able to filter duplicate suggestions coming from different documents. | ||
* When different contexts match the same suggestion form only the best one (sorted by weight) is kept. | ||
* In order to keep this feature fast, the de-duplication of suggestions with different contexts is done | ||
* only on the top N*num_contexts suggestions per segment. This means that skip_duplicates will visit at most N*num_contexts suggestions | ||
* per segment to find unique suggestions that match the input. If more than N*num_contexts suggestions are duplicated with different | ||
* contexts this collector will be able to return only one suggestion even when N is greater than 1. | ||
**/ | ||
private static final class TopDocumentsCollector extends TopSuggestDocsCollector { | ||
|
||
/** | ||
|
@@ -150,93 +164,57 @@ public List<CharSequence> getContexts() { | |
} | ||
} | ||
|
||
private static final class SuggestDocPriorityQueue extends PriorityQueue<SuggestDoc> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my own understanding, can you briefly explain what the queue did here before and why this can now be done in TopSuggestDocsCollector (at least thats what it looks to me) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prior to this change the custom collector was used to group the suggestions coming from different contexts per document. |
||
private final Map<Integer, SuggestDoc> docsMap; | ||
|
||
SuggestDocPriorityQueue(int maxSize) { | ||
super(maxSize); | ||
} | ||
|
||
@Override | ||
protected boolean lessThan(SuggestDoc a, SuggestDoc b) { | ||
if (a.score == b.score) { | ||
int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); | ||
if (cmp == 0) { | ||
// prefer smaller doc id, in case of a tie | ||
return a.doc > b.doc; | ||
} else { | ||
return cmp > 0; | ||
} | ||
} | ||
return a.score < b.score; | ||
} | ||
|
||
public SuggestDoc[] getResults() { | ||
int size = size(); | ||
SuggestDoc[] res = new SuggestDoc[size]; | ||
for (int i = size - 1; i >= 0; i--) { | ||
res[i] = pop(); | ||
} | ||
return res; | ||
} | ||
} | ||
|
||
private final int num; | ||
private final SuggestDocPriorityQueue pq; | ||
private final Map<Integer, SuggestDoc> scoreDocMap; | ||
|
||
// TODO: expose dup removal | ||
|
||
TopDocumentsCollector(int num) { | ||
super(1, false); // TODO hack, we don't use the underlying pq, so we allocate a size of 1 | ||
this.num = num; | ||
this.scoreDocMap = new LinkedHashMap<>(num); | ||
this.pq = new SuggestDocPriorityQueue(num); | ||
} | ||
|
||
@Override | ||
public int getCountToCollect() { | ||
// This is only needed because we initialize | ||
// the base class with 1 instead of the actual num | ||
return num; | ||
} | ||
|
||
|
||
@Override | ||
protected void doSetNextReader(LeafReaderContext context) throws IOException { | ||
super.doSetNextReader(context); | ||
updateResults(); | ||
} | ||
|
||
private void updateResults() { | ||
for (SuggestDoc suggestDoc : scoreDocMap.values()) { | ||
if (pq.insertWithOverflow(suggestDoc) == suggestDoc) { | ||
break; | ||
} | ||
} | ||
scoreDocMap.clear(); | ||
TopDocumentsCollector(int num, boolean hasContexts, boolean skipDuplicates) { | ||
super(Math.max(1, num), skipDuplicates); | ||
this.docsMap = hasContexts ? new LinkedHashMap<>(num) : null; | ||
} | ||
|
||
@Override | ||
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { | ||
if (scoreDocMap.containsKey(docID)) { | ||
SuggestDoc suggestDoc = scoreDocMap.get(docID); | ||
suggestDoc.add(key, context, score); | ||
} else if (scoreDocMap.size() <= num) { | ||
scoreDocMap.put(docID, new SuggestDoc(docBase + docID, key, context, score)); | ||
if (docsMap == null) { | ||
super.collect(docID, key, context, score); | ||
} else { | ||
throw new CollectionTerminatedException(); | ||
int globalDoc = docID + docBase; | ||
if (docsMap.containsKey(globalDoc)) { | ||
docsMap.get(globalDoc).add(key, context, score); | ||
} else { | ||
docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score)); | ||
super.collect(docID, key, context, score); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public TopSuggestDocs get() throws IOException { | ||
updateResults(); // to empty the last set of collected suggest docs | ||
TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs = pq.getResults(); | ||
if (suggestScoreDocs.length > 0) { | ||
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score); | ||
} else { | ||
TopSuggestDocs entries = super.get(); | ||
if (entries.scoreDocs.length == 0) { | ||
return TopSuggestDocs.EMPTY; | ||
} | ||
// The parent class returns suggestions, not documents, and dedups only the surface form (without contexts). | ||
// The following code groups suggestions matching different contexts by document id and dedups the surface form + contexts | ||
// if needed (skip_duplicates). | ||
int size = entries.scoreDocs.length; | ||
final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList(size); | ||
final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null; | ||
for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) { | ||
final SuggestDoc suggestDoc; | ||
if (docsMap != null) { | ||
suggestDoc = docsMap.get(suggestEntry.doc); | ||
} else { | ||
suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score); | ||
} | ||
if (doSkipDuplicates()) { | ||
if (seenSurfaceForms.contains(suggestDoc.key)) { | ||
continue; | ||
} | ||
seenSurfaceForms.add(suggestDoc.key); | ||
} | ||
suggestDocs.add(suggestDoc); | ||
} | ||
return new TopSuggestDocs((int) entries.totalHits, | ||
suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore()); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,8 +18,10 @@ | |
*/ | ||
package org.elasticsearch.search.suggest.completion; | ||
|
||
import org.apache.lucene.analysis.CharArraySet; | ||
import org.apache.lucene.search.ScoreDoc; | ||
import org.apache.lucene.search.suggest.Lookup; | ||
import org.elasticsearch.Version; | ||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
import org.elasticsearch.common.io.stream.StreamOutput; | ||
|
@@ -68,11 +70,32 @@ public final class CompletionSuggestion extends Suggest.Suggestion<CompletionSug | |
|
||
public static final int TYPE = 4; | ||
|
||
private boolean skipDuplicates; | ||
|
||
public CompletionSuggestion() { | ||
} | ||
|
||
public CompletionSuggestion(String name, int size) { | ||
public CompletionSuggestion(String name, int size, boolean skipDuplicates) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I always get confused, is this a user-facing API? In that case, should we keep the original constructor with "false" as the default? If not all good. |
||
super(name, size); | ||
this.skipDuplicates = skipDuplicates; | ||
} | ||
|
||
@Override | ||
public void readFrom(StreamInput in) throws IOException { | ||
super.readFrom(in); | ||
// TODO should be backported to 6.1.0 | ||
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { | ||
skipDuplicates = in.readBoolean(); | ||
} | ||
} | ||
|
||
@Override | ||
public void writeTo(StreamOutput out) throws IOException { | ||
super.writeTo(out); | ||
// TODO should be backported to 6.1.0 | ||
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { | ||
out.writeBoolean(skipDuplicates); | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -95,7 +118,7 @@ public boolean hasScoreDocs() { | |
} | ||
|
||
public static CompletionSuggestion fromXContent(XContentParser parser, String name) throws IOException { | ||
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1); | ||
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1, false); | ||
parseEntries(parser, suggestion, CompletionSuggestion.Entry::fromXContent); | ||
return suggestion; | ||
} | ||
|
@@ -146,9 +169,19 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe | |
// the global top <code>size</code> entries are collected from the shard results | ||
// using a priority queue | ||
OptionPriorityQueue priorityQueue = new OptionPriorityQueue(leader.getSize(), COMPARATOR); | ||
// Dedup duplicate suggestions (based on the surface form) if skip duplicates is activated | ||
final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null; | ||
for (Suggest.Suggestion<Entry> suggestion : toReduce) { | ||
assert suggestion.getName().equals(name) : "name should be identical across all suggestions"; | ||
for (Entry.Option option : ((CompletionSuggestion) suggestion).getOptions()) { | ||
if (leader.skipDuplicates) { | ||
assert ((CompletionSuggestion) suggestion).skipDuplicates; | ||
String text = option.getText().string(); | ||
if (seenSurfaceForms.contains(text)) { | ||
continue; | ||
} | ||
seenSurfaceForms.add(text); | ||
} | ||
if (option == priorityQueue.insertWithOverflow(option)) { | ||
// if the current option has overflown from pq, | ||
// we can assume all of the successive options | ||
|
@@ -157,7 +190,7 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe | |
} | ||
} | ||
} | ||
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize()); | ||
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize(), leader.skipDuplicates); | ||
final Entry entry = new Entry(leaderEntry.getText(), leaderEntry.getOffset(), leaderEntry.getLength()); | ||
Collections.addAll(entry.getOptions(), priorityQueue.get()); | ||
suggestion.addTerm(entry); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
package org.elasticsearch.search.suggest.completion; | ||
|
||
import org.elasticsearch.ElasticsearchParseException; | ||
import org.elasticsearch.Version; | ||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.bytes.BytesReference; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
|
@@ -57,6 +58,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug | |
private static final XContentType CONTEXT_BYTES_XCONTENT_TYPE = XContentType.JSON; | ||
static final String SUGGESTION_NAME = "completion"; | ||
static final ParseField CONTEXTS_FIELD = new ParseField("contexts", "context"); | ||
static final ParseField SKIP_DUPLICATES_FIELD = new ParseField("skip_duplicates"); | ||
|
||
/** | ||
* { | ||
|
@@ -94,11 +96,13 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug | |
v.contextBytes = builder.bytes(); | ||
p.skipChildren(); | ||
}, CONTEXTS_FIELD, ObjectParser.ValueType.OBJECT); // context is deprecated | ||
PARSER.declareBoolean(CompletionSuggestionBuilder::skipDuplicates, SKIP_DUPLICATES_FIELD); | ||
} | ||
|
||
protected FuzzyOptions fuzzyOptions; | ||
protected RegexOptions regexOptions; | ||
protected BytesReference contextBytes = null; | ||
protected boolean skipDuplicates = false; | ||
|
||
public CompletionSuggestionBuilder(String field) { | ||
super(field); | ||
|
@@ -113,6 +117,7 @@ private CompletionSuggestionBuilder(String fieldname, CompletionSuggestionBuilde | |
fuzzyOptions = in.fuzzyOptions; | ||
regexOptions = in.regexOptions; | ||
contextBytes = in.contextBytes; | ||
skipDuplicates = in.skipDuplicates; | ||
} | ||
|
||
/** | ||
|
@@ -123,13 +128,21 @@ public CompletionSuggestionBuilder(StreamInput in) throws IOException { | |
fuzzyOptions = in.readOptionalWriteable(FuzzyOptions::new); | ||
regexOptions = in.readOptionalWriteable(RegexOptions::new); | ||
contextBytes = in.readOptionalBytesReference(); | ||
// TODO should be backported to 6.1.0 | ||
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { | ||
skipDuplicates = in.readBoolean(); | ||
} | ||
} | ||
|
||
@Override | ||
public void doWriteTo(StreamOutput out) throws IOException { | ||
out.writeOptionalWriteable(fuzzyOptions); | ||
out.writeOptionalWriteable(regexOptions); | ||
out.writeOptionalBytesReference(contextBytes); | ||
// TODO should be backported to 6.1.0 | ||
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { | ||
out.writeBoolean(skipDuplicates); | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -210,6 +223,15 @@ private CompletionSuggestionBuilder contexts(XContentBuilder contextBuilder) { | |
return this; | ||
} | ||
|
||
public boolean skipDuplicates() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: maybe add some javadoc since this is what most users see that use the java api |
||
return skipDuplicates; | ||
} | ||
|
||
public CompletionSuggestionBuilder skipDuplicates(boolean skipDuplicates) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: maybe add some javadoc since this is what most users see that use the java api |
||
this.skipDuplicates = skipDuplicates; | ||
return this; | ||
} | ||
|
||
private static class InnerBuilder extends CompletionSuggestionBuilder { | ||
private String field; | ||
|
||
|
@@ -231,6 +253,9 @@ protected XContentBuilder innerToXContent(XContentBuilder builder, Params params | |
if (regexOptions != null) { | ||
regexOptions.toXContent(builder, params); | ||
} | ||
if (skipDuplicates) { | ||
builder.field(SKIP_DUPLICATES_FIELD.getPreferredName(), skipDuplicates); | ||
} | ||
if (contextBytes != null) { | ||
builder.rawField(CONTEXTS_FIELD.getPreferredName(), contextBytes); | ||
} | ||
|
@@ -255,6 +280,7 @@ public SuggestionContext build(QueryShardContext context) throws IOException { | |
// copy over common settings to each suggestion builder | ||
final MapperService mapperService = context.getMapperService(); | ||
populateCommonFields(mapperService, suggestionContext); | ||
suggestionContext.setSkipDuplicates(skipDuplicates); | ||
suggestionContext.setFuzzyOptions(fuzzyOptions); | ||
suggestionContext.setRegexOptions(regexOptions); | ||
MappedFieldType mappedFieldType = mapperService.fullName(suggestionContext.getField()); | ||
|
@@ -302,13 +328,14 @@ public String getWriteableName() { | |
|
||
@Override | ||
protected boolean doEquals(CompletionSuggestionBuilder other) { | ||
return Objects.equals(fuzzyOptions, other.fuzzyOptions) && | ||
return skipDuplicates == other.skipDuplicates && | ||
Objects.equals(fuzzyOptions, other.fuzzyOptions) && | ||
Objects.equals(regexOptions, other.regexOptions) && | ||
Objects.equals(contextBytes, other.contextBytes); | ||
} | ||
|
||
@Override | ||
protected int doHashCode() { | ||
return Objects.hash(fuzzyOptions, regexOptions, contextBytes); | ||
return Objects.hash(fuzzyOptions, regexOptions, contextBytes, skipDuplicates); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question for my own understanding: what is N here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The completion suggester is document-based, so N here is the number of documents to return.