Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Term Vectors with artificial docs and keyword fields #53504

Merged
merged 8 commits into from
Mar 13, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -120,24 +120,6 @@ public IndexableField[] getFields(String name) {
return f.toArray(new IndexableField[f.size()]);
}

/**
* Returns an array of values of the field specified as the method parameter.
* This method returns an empty array when there are no
* matching fields. It never returns null.
* If you want the actual numeric field instances back, use {@link #getFields}.
* @param name the name of the field
* @return a <code>String[]</code> of field values
*/
public final String[] getValues(String name) {
List<String> result = new ArrayList<>();
for (IndexableField field : fields) {
if (field.name().equals(name) && field.stringValue() != null) {
result.add(field.stringValue());
}
}
return result.toArray(new String[result.size()]);
}

public IndexableField getField(String name) {
for (IndexableField field : fields) {
if (field.name().equals(name)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import org.elasticsearch.search.dfs.AggregatedDfs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -313,14 +314,33 @@ private static Fields generateTermVectorsFromDoc(IndexShard indexShard, TermVect
else {
seenFields.add(field.name());
}
String[] values = doc.getValues(field.name());
String[] values = getValues(doc.getFields(field.name()));
documentFields.add(new DocumentField(field.name(), Arrays.asList((Object[]) values)));
}
return generateTermVectors(indexShard,
XContentHelper.convertToMap(parsedDocument.source(), true, request.xContentType()).v2(), documentFields,
request.offsets(), request.perFieldAnalyzer(), seenFields);
}

/**
* Returns an array of values of the field specified as the method parameter.
* This method returns an empty array when there are no
* matching fields. It never returns null.
* @param fields The <code>IndexableField</code> to get the values from
* @return a <code>String[]</code> of field values
*/
public static String[] getValues(IndexableField[] fields) {
List<String> result = new ArrayList<>();
for (IndexableField field : fields) {
if (field.stringValue() != null) {
result.add(field.stringValue());
} else if (field.binaryValue() != null) { // KeywordFieldType
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this will create duplicates due to doc values field that also use a binary value (SortedSetDocValuesField). You should change this to something like:

if (field.fieldType() instanceof KeywordFieldMapper.KeywordFieldType) {
  result.add(field.binaryValue().utf8ToString());
} else if (field.fieldType() instanceof StringFieldType) {
  result.add(field.stringValue());
}

result.add(field.binaryValue().utf8ToString());
}
}
return result.toArray(new String[0]);
}

private static ParsedDocument parseDocument(IndexShard indexShard, String index, BytesReference doc,
XContentType xContentType, String routing) {
MapperService mapperService = indexShard.mapperService();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.index.termvectors.TermVectorsService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.InternalSettingsPlugin;
Expand Down Expand Up @@ -202,7 +203,7 @@ private void testIgnoreMalfomedForValue(String value, String expectedException)

IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
assertArrayEquals(new String[] { "field" }, TermVectorsService.getValues(doc.rootDoc().getFields("_ignored")));
}

public void testChangeFormat() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,15 @@ public void testDotsWithExistingMapper() throws Exception {
.endObject());
ParsedDocument doc = mapper.parse(new SourceToParse("test", "1", bytes, XContentType.JSON));
assertNull(doc.dynamicMappingsUpdate()); // no update!
String[] values = doc.rootDoc().getValues("foo.bar.baz");
assertEquals(3, values.length);
assertEquals("123", values[0]);
assertEquals("456", values[1]);
assertEquals("789", values[2]);

IndexableField[] fields = doc.rootDoc().getFields("foo.bar.baz");
assertEquals(6, fields.length);
assertEquals(123, fields[0].numericValue());
assertEquals("123", fields[1].stringValue());
assertEquals(456, fields[2].numericValue());
assertEquals("456", fields[3].stringValue());
assertEquals(789, fields[4].numericValue());
assertEquals("789", fields[5].stringValue());
}

public void testDotsWithExistingNestedMapper() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.termvectors.TermVectorsService;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.VersionUtils;

Expand All @@ -52,7 +53,7 @@ private static SortedSet<String> set(String... values) {
}

void assertFieldNames(Set<String> expected, ParsedDocument doc) {
String[] got = doc.rootDoc().getValues("_field_names");
String[] got = TermVectorsService.getValues(doc.rootDoc().getFields("_field_names"));
assertEquals(expected, set(got));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.termvectors.TermVectorsService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.InternalSettingsPlugin;
Expand Down Expand Up @@ -194,7 +195,7 @@ public void testIgnoreMalformed() throws Exception {

IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
assertArrayEquals(new String[] { "field" }, TermVectorsService.getValues(doc.rootDoc().getFields("_ignored")));
}

public void testNullValue() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.index.termvectors.TermVectorsService;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
Expand Down Expand Up @@ -128,6 +129,9 @@ public void testDefaults() throws Exception {
fieldType = fields[1].fieldType();
assertThat(fieldType.indexOptions(), equalTo(IndexOptions.NONE));
assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType());

// used by TermVectorsService
assertArrayEquals(new String[] { "1234", "1234" }, TermVectorsService.getValues(doc.rootDoc().getFields("field")));
}

public void testIgnoreAbove() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
package org.elasticsearch.index.mapper;

import com.carrotsearch.randomizedtesting.annotations.Timeout;

import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.common.Strings;
Expand All @@ -32,6 +31,7 @@
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.mapper.NumberFieldMapper.NumberType;
import org.elasticsearch.index.mapper.NumberFieldTypeTests.OutOfRangeSpec;
import org.elasticsearch.index.termvectors.TermVectorsService;

import java.io.ByteArrayInputStream;
import java.io.IOException;
Expand Down Expand Up @@ -250,7 +250,7 @@ public void testIgnoreMalformed() throws Exception {

IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
assertArrayEquals(new String[] { "field" }, TermVectorsService.getValues(doc.rootDoc().getFields("_ignored")));
}
}
}
Expand Down