Expose duplicate removal in the completion suggester (#26496)
This change exposes the duplicate-removal support added in Lucene to the completion suggester
through a new option, `skip_duplicates` (defaults to false).
It also adapts the custom suggest collector to deduplicate suggestions when multiple contexts match the input.

Closes #23364
jimczi authored Sep 7, 2017
1 parent abe83c4 commit d68d8c9
Showing 14 changed files with 394 additions and 96 deletions.
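
A minimal usage sketch (not part of this commit) of the new option through the Java API: it wires the skipDuplicates(boolean) setter added to CompletionSuggestionBuilder below into a search request. The field name suggest_field, the suggestion name song-suggest, and the prefix text are illustrative placeholders.

import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;

public class SkipDuplicatesUsageSketch {
    public static SearchSourceBuilder suggestRequest() {
        // Completion suggestion on a hypothetical "suggest_field" completion mapping.
        CompletionSuggestionBuilder completion = new CompletionSuggestionBuilder("suggest_field")
                .prefix("nir")
                .skipDuplicates(true) // option added by this commit, defaults to false
                .size(5);             // number of (de-duplicated) suggestions to return
        // Attach the suggestion to the suggest section of a search request.
        return new SearchSourceBuilder()
                .suggest(new SuggestBuilder().addSuggestion("song-suggest", completion));
    }
}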
org/elasticsearch/search/suggest/completion/CompletionSuggester.java
@@ -18,17 +18,16 @@
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.CompletionQuery;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.CompletionFieldMapper;
import org.elasticsearch.search.suggest.Suggest;
@@ -53,12 +52,14 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
final CompletionSuggestionContext suggestionContext, final IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
if (suggestionContext.getFieldType() != null) {
final CompletionFieldMapper.CompletionFieldType fieldType = suggestionContext.getFieldType();
CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize());
CompletionSuggestion completionSuggestion =
new CompletionSuggestion(name, suggestionContext.getSize(), suggestionContext.isSkipDuplicates());
spare.copyUTF8Bytes(suggestionContext.getText());
CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry(
new Text(spare.toString()), 0, spare.length());
completionSuggestion.addTerm(completionSuggestEntry);
TopSuggestDocsCollector collector = new TopDocumentsCollector(suggestionContext.getSize());
TopSuggestDocsCollector collector =
new TopDocumentsCollector(suggestionContext.getSize(), suggestionContext.isSkipDuplicates());
suggest(searcher, suggestionContext.toQuery(), collector);
int numResult = 0;
for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
@@ -97,8 +98,21 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
}
}

// TODO: this should be refactored and moved to lucene
// see https://issues.apache.org/jira/browse/LUCENE-6880
/**
* TODO: this should be refactored and moved to Lucene, see https://issues.apache.org/jira/browse/LUCENE-6880
*
* Custom collector that returns top documents from the completion suggester.
* When suggestions are augmented with context values, this collector groups suggestions coming from the same document
* but matching different contexts. Each document is counted as one entry and the provided size is the expected number
* of documents to return (not the number of suggestions).
* This collector is also able to filter out duplicate suggestions coming from different documents.
* When different contexts match the same suggestion form, only the best one (sorted by weight) is kept.
* In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
* only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
* This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
* that match the input. If more than N*num_contexts suggestions are duplicated with different contexts, this collector
* will not be able to return more than one suggestion even when N is greater than 1.
**/
private static final class TopDocumentsCollector extends TopSuggestDocsCollector {

/**
@@ -150,93 +164,53 @@ public List<CharSequence> getContexts() {
}
}

private static final class SuggestDocPriorityQueue extends PriorityQueue<SuggestDoc> {
private final Map<Integer, SuggestDoc> docsMap;

SuggestDocPriorityQueue(int maxSize) {
super(maxSize);
}

@Override
protected boolean lessThan(SuggestDoc a, SuggestDoc b) {
if (a.score == b.score) {
int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
if (cmp == 0) {
// prefer smaller doc id, in case of a tie
return a.doc > b.doc;
} else {
return cmp > 0;
}
}
return a.score < b.score;
}

public SuggestDoc[] getResults() {
int size = size();
SuggestDoc[] res = new SuggestDoc[size];
for (int i = size - 1; i >= 0; i--) {
res[i] = pop();
}
return res;
}
}

private final int num;
private final SuggestDocPriorityQueue pq;
private final Map<Integer, SuggestDoc> scoreDocMap;

// TODO: expose dup removal

TopDocumentsCollector(int num) {
super(1, false); // TODO hack, we don't use the underlying pq, so we allocate a size of 1
this.num = num;
this.scoreDocMap = new LinkedHashMap<>(num);
this.pq = new SuggestDocPriorityQueue(num);
}

@Override
public int getCountToCollect() {
// This is only needed because we initialize
// the base class with 1 instead of the actual num
return num;
}


@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
super.doSetNextReader(context);
updateResults();
}

private void updateResults() {
for (SuggestDoc suggestDoc : scoreDocMap.values()) {
if (pq.insertWithOverflow(suggestDoc) == suggestDoc) {
break;
}
}
scoreDocMap.clear();
TopDocumentsCollector(int num, boolean skipDuplicates) {
super(Math.max(1, num), skipDuplicates);
this.docsMap = new LinkedHashMap<>(num);
}

@Override
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
if (scoreDocMap.containsKey(docID)) {
SuggestDoc suggestDoc = scoreDocMap.get(docID);
suggestDoc.add(key, context, score);
} else if (scoreDocMap.size() <= num) {
scoreDocMap.put(docID, new SuggestDoc(docBase + docID, key, context, score));
int globalDoc = docID + docBase;
if (docsMap.containsKey(globalDoc)) {
docsMap.get(globalDoc).add(key, context, score);
} else {
throw new CollectionTerminatedException();
docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
super.collect(docID, key, context, score);
}
}

@Override
public TopSuggestDocs get() throws IOException {
updateResults(); // to empty the last set of collected suggest docs
TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs = pq.getResults();
if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
} else {
TopSuggestDocs entries = super.get();
if (entries.scoreDocs.length == 0) {
return TopSuggestDocs.EMPTY;
}
// The parent class returns suggestions, not documents, and dedups only the surface form (without contexts).
// The following code groups suggestions matching different contexts by document id and dedups the surface form + contexts
// if needed (skip_duplicates).
int size = entries.scoreDocs.length;
final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList<>(size);
final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null;
for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
final SuggestDoc suggestDoc;
if (docsMap != null) {
suggestDoc = docsMap.get(suggestEntry.doc);
} else {
suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score);
}
if (doSkipDuplicates()) {
if (seenSurfaceForms.contains(suggestDoc.key)) {
continue;
}
seenSurfaceForms.add(suggestDoc.key);
}
suggestDocs.add(suggestDoc);
}
return new TopSuggestDocs((int) entries.totalHits,
suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore());
}
}
}
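
The Javadoc on TopDocumentsCollector above describes two behaviours: hits from the same document (matched through different contexts) are grouped into a single entry, and, when skip_duplicates is set, a surface form already returned for a better-scored document is dropped. The standalone sketch below mirrors that post-processing with plain Java collections instead of the Lucene/Elasticsearch types used above; the Hit class, the method name, and the assumption that hits arrive sorted by descending score are illustrative.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class CollectorDedupSketch {

    /** Simplified stand-in for a single (document, surface form, context, score) hit. */
    static final class Hit {
        final int doc;
        final String key;
        final String context;
        final float score;

        Hit(int doc, String key, String context, float score) {
            this.doc = doc;
            this.key = key;
            this.context = context;
            this.score = score;
        }
    }

    /**
     * Groups hits by document id so that each document counts as one result, merging the
     * contexts of hits that point at the same document. When skipDuplicates is true, a
     * document whose surface form was already emitted by a higher-scored document is dropped.
     * Hits are assumed to be sorted by descending score, which is the order in which the
     * collector above sees them.
     */
    static Map<Integer, List<Hit>> groupAndDedup(List<Hit> sortedHits, boolean skipDuplicates) {
        Map<Integer, List<Hit>> byDoc = new LinkedHashMap<>();
        Set<String> seenSurfaceForms = skipDuplicates ? new HashSet<>() : null;
        for (Hit hit : sortedHits) {
            List<Hit> existing = byDoc.get(hit.doc);
            if (existing != null) {
                // Same document matched through another context: merge instead of adding a new entry.
                existing.add(hit);
                continue;
            }
            if (skipDuplicates) {
                if (seenSurfaceForms.contains(hit.key)) {
                    continue; // surface form already returned for a better-scored document
                }
                seenSurfaceForms.add(hit.key);
            }
            List<Hit> hits = new ArrayList<>();
            hits.add(hit);
            byDoc.put(hit.doc, hits);
        }
        return byDoc;
    }
}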
org/elasticsearch/search/suggest/completion/CompletionSuggestion.java
@@ -18,8 +18,10 @@
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.suggest.Lookup;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -68,11 +70,38 @@ public final class CompletionSuggestion extends Suggest.Suggestion<CompletionSug

public static final int TYPE = 4;

private boolean skipDuplicates;

public CompletionSuggestion() {
}

public CompletionSuggestion(String name, int size) {
/**
* Constructor
* @param name The name for the suggestions
* @param size The number of suggestions to return
* @param skipDuplicates Whether duplicate suggestions should be filtered out
*/
public CompletionSuggestion(String name, int size, boolean skipDuplicates) {
super(name, size);
this.skipDuplicates = skipDuplicates;
}

@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
// TODO should be backported to 6.1.0
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
skipDuplicates = in.readBoolean();
}
}

@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
// TODO should be backported to 6.1.0
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeBoolean(skipDuplicates);
}
}

/**
@@ -95,7 +124,7 @@ public boolean hasScoreDocs() {
}

public static CompletionSuggestion fromXContent(XContentParser parser, String name) throws IOException {
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1);
CompletionSuggestion suggestion = new CompletionSuggestion(name, -1, false);
parseEntries(parser, suggestion, CompletionSuggestion.Entry::fromXContent);
return suggestion;
}
@@ -146,9 +175,19 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe
// the global top <code>size</code> entries are collected from the shard results
// using a priority queue
OptionPriorityQueue priorityQueue = new OptionPriorityQueue(leader.getSize(), COMPARATOR);
// Skip duplicate suggestions (based on the surface form) if skip_duplicates is activated
final CharArraySet seenSurfaceForms = leader.skipDuplicates ? new CharArraySet(leader.getSize(), false) : null;
for (Suggest.Suggestion<Entry> suggestion : toReduce) {
assert suggestion.getName().equals(name) : "name should be identical across all suggestions";
for (Entry.Option option : ((CompletionSuggestion) suggestion).getOptions()) {
if (leader.skipDuplicates) {
assert ((CompletionSuggestion) suggestion).skipDuplicates;
String text = option.getText().string();
if (seenSurfaceForms.contains(text)) {
continue;
}
seenSurfaceForms.add(text);
}
if (option == priorityQueue.insertWithOverflow(option)) {
// if the current option has overflown from pq,
// we can assume all of the successive options
@@ -157,7 +196,7 @@ public static CompletionSuggestion reduceTo(List<Suggest.Suggestion<Entry>> toRe
}
}
}
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize());
final CompletionSuggestion suggestion = new CompletionSuggestion(leader.getName(), leader.getSize(), leader.skipDuplicates);
final Entry entry = new Entry(leaderEntry.getText(), leaderEntry.getOffset(), leaderEntry.getLength());
Collections.addAll(entry.getOptions(), priorityQueue.get());
suggestion.addTerm(entry);
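
The reduceTo method above merges per-shard suggestions through an internal OptionPriorityQueue and, when skip_duplicates is set, discards options whose surface form has already been accepted. Below is a simplified, self-contained sketch of that merge using a JDK PriorityQueue as a bounded top-N; the Option class and the method signature are illustrative assumptions, and the early termination on overflow used by the real code is omitted.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Set;

public class ReduceDedupSketch {

    /** Illustrative stand-in for a suggestion option coming back from one shard. */
    static final class Option {
        final String text;
        final float score;

        Option(String text, float score) {
            this.text = text;
            this.score = score;
        }
    }

    /**
     * Merges options from several shards into the global top `size` by score, optionally
     * skipping options whose surface form (text) was already accepted. As in the reduce
     * step above, a surface form counts as seen even if its option later falls out of the
     * top-N. Assumes size >= 1.
     */
    static List<Option> reduce(List<List<Option>> shardResults, int size, boolean skipDuplicates) {
        // Min-heap on score so the weakest of the current top-N is evicted first.
        PriorityQueue<Option> topN = new PriorityQueue<>(size, Comparator.comparingDouble((Option o) -> o.score));
        Set<String> seenSurfaceForms = skipDuplicates ? new HashSet<>() : null;
        for (List<Option> shard : shardResults) {
            for (Option option : shard) {
                if (skipDuplicates && !seenSurfaceForms.add(option.text)) {
                    continue; // duplicate surface form from another shard
                }
                topN.offer(option);
                if (topN.size() > size) {
                    topN.poll(); // drop the lowest-scored option
                }
            }
        }
        List<Option> result = new ArrayList<>(topN);
        result.sort(Comparator.comparingDouble((Option o) -> o.score).reversed());
        return result;
    }
}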
org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java
@@ -19,6 +19,7 @@
package org.elasticsearch.search.suggest.completion;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
@@ -57,6 +58,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
private static final XContentType CONTEXT_BYTES_XCONTENT_TYPE = XContentType.JSON;
static final String SUGGESTION_NAME = "completion";
static final ParseField CONTEXTS_FIELD = new ParseField("contexts", "context");
static final ParseField SKIP_DUPLICATES_FIELD = new ParseField("skip_duplicates");

/**
* {
@@ -94,11 +96,13 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
v.contextBytes = builder.bytes();
p.skipChildren();
}, CONTEXTS_FIELD, ObjectParser.ValueType.OBJECT); // context is deprecated
PARSER.declareBoolean(CompletionSuggestionBuilder::skipDuplicates, SKIP_DUPLICATES_FIELD);
}

protected FuzzyOptions fuzzyOptions;
protected RegexOptions regexOptions;
protected BytesReference contextBytes = null;
protected boolean skipDuplicates = false;

public CompletionSuggestionBuilder(String field) {
super(field);
@@ -113,6 +117,7 @@ private CompletionSuggestionBuilder(String fieldname, CompletionSuggestionBuilde
fuzzyOptions = in.fuzzyOptions;
regexOptions = in.regexOptions;
contextBytes = in.contextBytes;
skipDuplicates = in.skipDuplicates;
}

/**
@@ -123,13 +128,21 @@ public CompletionSuggestionBuilder(StreamInput in) throws IOException {
fuzzyOptions = in.readOptionalWriteable(FuzzyOptions::new);
regexOptions = in.readOptionalWriteable(RegexOptions::new);
contextBytes = in.readOptionalBytesReference();
// TODO should be backported to 6.1.0
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
skipDuplicates = in.readBoolean();
}
}

@Override
public void doWriteTo(StreamOutput out) throws IOException {
out.writeOptionalWriteable(fuzzyOptions);
out.writeOptionalWriteable(regexOptions);
out.writeOptionalBytesReference(contextBytes);
// TODO should be backported to 6.1.0
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeBoolean(skipDuplicates);
}
}

/**
@@ -210,6 +223,21 @@ private CompletionSuggestionBuilder contexts(XContentBuilder contextBuilder) {
return this;
}

/**
* Returns whether duplicate suggestions should be filtered out.
*/
public boolean skipDuplicates() {
return skipDuplicates;
}

/**
* Sets whether duplicate suggestions should be filtered out. Defaults to <tt>false</tt>.
*/
public CompletionSuggestionBuilder skipDuplicates(boolean skipDuplicates) {
this.skipDuplicates = skipDuplicates;
return this;
}

private static class InnerBuilder extends CompletionSuggestionBuilder {
private String field;

@@ -231,6 +259,9 @@ protected XContentBuilder innerToXContent(XContentBuilder builder, Params params
if (regexOptions != null) {
regexOptions.toXContent(builder, params);
}
if (skipDuplicates) {
builder.field(SKIP_DUPLICATES_FIELD.getPreferredName(), skipDuplicates);
}
if (contextBytes != null) {
builder.rawField(CONTEXTS_FIELD.getPreferredName(), contextBytes);
}
@@ -255,6 +286,7 @@ public SuggestionContext build(QueryShardContext context) throws IOException {
// copy over common settings to each suggestion builder
final MapperService mapperService = context.getMapperService();
populateCommonFields(mapperService, suggestionContext);
suggestionContext.setSkipDuplicates(skipDuplicates);
suggestionContext.setFuzzyOptions(fuzzyOptions);
suggestionContext.setRegexOptions(regexOptions);
MappedFieldType mappedFieldType = mapperService.fullName(suggestionContext.getField());
@@ -302,13 +334,14 @@ public String getWriteableName() {

@Override
protected boolean doEquals(CompletionSuggestionBuilder other) {
return Objects.equals(fuzzyOptions, other.fuzzyOptions) &&
return skipDuplicates == other.skipDuplicates &&
Objects.equals(fuzzyOptions, other.fuzzyOptions) &&
Objects.equals(regexOptions, other.regexOptions) &&
Objects.equals(contextBytes, other.contextBytes);
}

@Override
protected int doHashCode() {
return Objects.hash(fuzzyOptions, regexOptions, contextBytes);
return Objects.hash(fuzzyOptions, regexOptions, contextBytes, skipDuplicates);
}
}
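
For reference, the skip_duplicates flag that CompletionSuggestionBuilder above serializes under SKIP_DUPLICATES_FIELD (and parses back through PARSER.declareBoolean) surfaces inside the completion object of a suggest request body. The constant below sketches such a body; the suggestion name, field, and prefix are placeholders, not values taken from this commit.

public class SkipDuplicatesRequestBodySketch {

    /**
     * Illustrative _search request body exercising the skip_duplicates option handled by
     * CompletionSuggestionBuilder above. Names and values are placeholders.
     */
    public static final String SUGGEST_REQUEST_BODY =
        "{\n" +
        "  \"suggest\": {\n" +
        "    \"song-suggest\": {\n" +
        "      \"prefix\": \"nir\",\n" +
        "      \"completion\": {\n" +
        "        \"field\": \"suggest_field\",\n" +
        "        \"skip_duplicates\": true\n" +
        "      }\n" +
        "    }\n" +
        "  }\n" +
        "}\n";
}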
(Diffs for the remaining changed files are not shown.)
