From dfe1a10ac4361832eb793d3d222593452506c117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Thu, 12 Mar 2020 19:11:21 +0100 Subject: [PATCH] Mask wildcard query special characters on keyword queries (#53127) Wildcard queries on keyword fields get normalized, however this normalization step should exclude the two special characters * and ? in order to keep the wildcard query itself intact. Closes #46300 --- .../index/mapper/ConstantFieldType.java | 4 + .../index/mapper/KeywordFieldMapper.java | 1 + .../index/mapper/StringFieldType.java | 44 +++++-- .../index/mapper/TypeFieldMapper.java | 50 ++------ .../index/mapper/TypeFieldTypeTests.java | 23 ++-- .../index/query/RangeQueryBuilderTests.java | 7 -- .../query/WildcardQueryBuilderTests.java | 11 +- .../search/query/SearchQueryIT.java | 118 +++++++++++++++++- 8 files changed, 187 insertions(+), 71 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/ConstantFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/ConstantFieldType.java index 779e8f91350dc..a74e4b0c49daf 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/ConstantFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/ConstantFieldType.java @@ -79,6 +79,10 @@ private static String valueToString(Object value) { public final Query termQuery(Object value, QueryShardContext context) { String pattern = valueToString(value); if (matches(pattern, context)) { + if (context != null && context.getMapperService().hasNested()) { + // type filters are expected not to match nested docs + return Queries.newNonNestedFilter(context.indexVersionCreated()); + } return Queries.newMatchAllQuery(); } else { return new MatchNoDocsQuery(); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 099ae9b2aa7a2..54ba709eb99c0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -202,6 +202,7 @@ protected KeywordFieldType(KeywordFieldType ref) { this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace; } + @Override public KeywordFieldType clone() { return new KeywordFieldType(this); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 4ddda3df0af2d..05bf6b61d1de1 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -21,8 +21,6 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyQuery; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; @@ -31,6 +29,7 @@ import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.unit.Fuzziness; @@ -38,6 +37,8 @@ import org.elasticsearch.index.query.support.QueryParsers; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; @@ -47,6 +48,8 @@ * can be implemented. */ public abstract class StringFieldType extends TermBasedFieldType { + private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); + public StringFieldType() {} protected StringFieldType(MappedFieldType ref) { @@ -92,16 +95,41 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer @Override public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { - Query termQuery = termQuery(value, context); - if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) { - return termQuery; - } - + failIfNotIndexed(); if (context.allowExpensiveQueries() == false) { throw new ElasticsearchException("[wildcard] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."); } - Term term = MappedFieldType.extractTerm(termQuery); + + Term term; + if (searchAnalyzer() != null) { + // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there + // is a char_filter that would otherwise remove them + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); + BytesRefBuilder sb = new BytesRefBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + if (wildcardMatcher.start() > 0) { + String chunk = value.substring(last, wildcardMatcher.start()); + + BytesRef normalized = searchAnalyzer().normalize(name(), chunk); + sb.append(normalized); + } + // append the matched group - without normalizing + sb.append(new BytesRef(wildcardMatcher.group())); + + last = wildcardMatcher.end(); + } + if (last < value.length()) { + String chunk = value.substring(last); + BytesRef normalized = searchAnalyzer().normalize(name(), chunk); + sb.append(normalized); + } + term = new Term(name(), sb.toBytesRef()); + } else { + term = new Term(name(), indexedValueForSearch(value)); + } WildcardQuery query = new WildcardQuery(term); QueryParsers.setRewriteMethod(query, method); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java index c4d9ef966ca3d..9e8e9991b6f06 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java @@ -35,8 +35,10 @@ import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.geo.ShapeRelation; +import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; -import org.elasticsearch.common.lucene.search.Queries; +import org.elasticsearch.common.time.DateMathParser; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.fielddata.IndexFieldData; @@ -44,6 +46,7 @@ import org.elasticsearch.index.query.QueryShardContext; import java.io.IOException; +import java.time.ZoneId; import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -88,7 +91,7 @@ public MetadataFieldMapper getDefault(MappedFieldType fieldType, ParserContext c } } - public static final class TypeFieldType extends StringFieldType { + public static final class TypeFieldType extends ConstantFieldType { TypeFieldType() { } @@ -114,55 +117,26 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { } @Override - public boolean isSearchable() { - return true; - } - - @Override - public Query existsQuery(QueryShardContext context) { - return new MatchAllDocsQuery(); - } - - @Override - public Query termQuery(Object value, QueryShardContext context) { - return termsQuery(Arrays.asList(value), context); - } - - @Override - public Query termsQuery(List values, QueryShardContext context) { - DocumentMapper mapper = context.getMapperService().documentMapper(); - if (mapper == null) { - return new MatchNoDocsQuery("No types"); - } - BytesRef indexType = indexedValueForSearch(mapper.type()); - if (values.stream() - .map(this::indexedValueForSearch) - .anyMatch(indexType::equals)) { - if (context.getMapperService().hasNested()) { - // type filters are expected not to match nested docs - return Queries.newNonNestedFilter(context.indexVersionCreated()); - } else { - return new MatchAllDocsQuery(); - } - } else { - return new MatchNoDocsQuery("Type list does not contain the index type"); - } + protected boolean matches(String pattern, QueryShardContext context) { + String type = context.getMapperService().documentMapper().type(); + return pattern.equals(type); } @Override - public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, QueryShardContext context) { + public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, ShapeRelation relation, + ZoneId timeZone, DateMathParser parser, QueryShardContext context) { Query result = new MatchAllDocsQuery(); String type = context.getMapperService().documentMapper().type(); if (type != null) { BytesRef typeBytes = new BytesRef(type); if (lowerTerm != null) { - int comp = indexedValueForSearch(lowerTerm).compareTo(typeBytes); + int comp = BytesRefs.toBytesRef(lowerTerm).compareTo(typeBytes); if (comp > 0 || (comp == 0 && includeLower == false)) { result = new MatchNoDocsQuery("[_type] was lexicographically smaller than lower bound of range"); } } if (upperTerm != null) { - int comp = indexedValueForSearch(upperTerm).compareTo(typeBytes); + int comp = BytesRefs.toBytesRef(upperTerm).compareTo(typeBytes); if (comp < 0 || (comp == 0 && includeUpper == false)) { result = new MatchNoDocsQuery("[_type] was lexicographically greater than upper bound of range"); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TypeFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TypeFieldTypeTests.java index dc6f14cb0148a..98fe2a0afef64 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TypeFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TypeFieldTypeTests.java @@ -53,26 +53,25 @@ public void testTermsQuery() throws Exception { MapperService mapperService = Mockito.mock(MapperService.class); Mockito.when(mapperService.documentMapper()).thenReturn(null); Mockito.when(context.getMapperService()).thenReturn(mapperService); + DocumentMapper mapper = Mockito.mock(DocumentMapper.class); + Mockito.when(mapper.type()).thenReturn("_doc"); + Mockito.when(mapperService.documentMapper()).thenReturn(mapper); TypeFieldMapper.TypeFieldType ft = new TypeFieldMapper.TypeFieldType(); ft.setName(TypeFieldMapper.NAME); - Query query = ft.termQuery("my_type", context); + + Query query = ft.termQuery("_doc", context); + assertEquals(new MatchAllDocsQuery(), query); + + query = ft.termQuery("other_type", context); assertEquals(new MatchNoDocsQuery(), query); - DocumentMapper mapper = Mockito.mock(DocumentMapper.class); - Mockito.when(mapper.type()).thenReturn("my_type"); - Mockito.when(mapperService.documentMapper()).thenReturn(mapper); - query = ft.termQuery("my_type", context); + Mockito.when(mapper.type()).thenReturn("other_type"); + query = ft.termQuery("other_type", context); assertEquals(new MatchAllDocsQuery(), query); Mockito.when(mapperService.hasNested()).thenReturn(true); - query = ft.termQuery("my_type", context); + query = ft.termQuery("other_type", context); assertEquals(Queries.newNonNestedFilter(context.indexVersionCreated()), query); - - mapper = Mockito.mock(DocumentMapper.class); - Mockito.when(mapper.type()).thenReturn("other_type"); - Mockito.when(mapperService.documentMapper()).thenReturn(mapper); - query = ft.termQuery("my_type", context); - assertEquals(new MatchNoDocsQuery(), query); } } diff --git a/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java index a282bbe987d14..6c36e35b055ac 100644 --- a/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java @@ -560,13 +560,6 @@ public void testParseRelation() { assertEquals(ShapeRelation.INTERSECTS, builder.relation()); } - public void testTypeField() throws IOException { - RangeQueryBuilder builder = QueryBuilders.rangeQuery("_type") - .from("value1"); - builder.doToQuery(createShardContext()); - assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE); - } - /** * Range queries should generally be cacheable, at least the ones we create randomly. * This test makes sure we also test the non-cacheable cases regularly. diff --git a/server/src/test/java/org/elasticsearch/index/query/WildcardQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/WildcardQueryBuilderTests.java index 9482e0bb87d49..3eff0d1f819a5 100644 --- a/server/src/test/java/org/elasticsearch/index/query/WildcardQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/WildcardQueryBuilderTests.java @@ -27,6 +27,7 @@ import java.io.IOException; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.hamcrest.Matchers.equalTo; @@ -75,7 +76,9 @@ protected void doAssertLuceneQuery(WildcardQueryBuilder queryBuilder, Query quer assertThat(wildcardQuery.getField(), equalTo(expectedFieldName)); assertThat(wildcardQuery.getTerm().field(), equalTo(expectedFieldName)); - assertThat(wildcardQuery.getTerm().text(), equalTo(queryBuilder.value())); + // wildcard queries get normalized + String text = wildcardQuery.getTerm().text().toLowerCase(Locale.ROOT); + assertThat(text, equalTo(text)); } else { Query expected = new MatchNoDocsQuery("unknown field [" + expectedFieldName + "]"); assertEquals(expected, query); @@ -138,14 +141,14 @@ public void testTypeField() throws IOException { builder.doToQuery(createShardContext()); assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE); } - + public void testRewriteIndexQueryToMatchNone() throws IOException { WildcardQueryBuilder query = new WildcardQueryBuilder("_index", "does_not_exist"); QueryShardContext queryShardContext = createShardContext(); QueryBuilder rewritten = query.rewrite(queryShardContext); assertThat(rewritten, instanceOf(MatchNoneQueryBuilder.class)); - } - + } + public void testRewriteIndexQueryNotMatchNone() throws IOException { String fullIndexName = getIndex().getName(); String firstHalfOfIndexName = fullIndexName.substring(0,fullIndexName.length()/2); diff --git a/server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java b/server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java index 88873ead39c0a..a190e1a0c5c53 100644 --- a/server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java +++ b/server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java @@ -19,6 +19,8 @@ package org.elasticsearch.search.query; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.join.ScoreMode; @@ -32,11 +34,15 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.document.DocumentField; import org.elasticsearch.common.lucene.search.SpanBooleanQueryRewriteWithMaxClause; +import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.time.DateFormatter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.index.analysis.NormalizingCharFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchQueryBuilder; import org.elasticsearch.index.query.MultiMatchQueryBuilder; @@ -45,11 +51,14 @@ import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.RangeQueryBuilder; import org.elasticsearch.index.query.TermQueryBuilder; +import org.elasticsearch.index.query.WildcardQueryBuilder; import org.elasticsearch.index.query.WrapperQueryBuilder; import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; import org.elasticsearch.index.search.MatchQuery; import org.elasticsearch.indices.IndicesService; import org.elasticsearch.indices.TermsLookup; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; @@ -60,16 +69,20 @@ import org.elasticsearch.test.junit.annotations.TestIssueLogging; import java.io.IOException; +import java.io.Reader; import java.time.Instant; import java.time.ZoneId; import java.time.ZoneOffset; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; +import java.util.Arrays; import java.util.Collection; -import java.util.Collections; +import java.util.Map; import java.util.Random; import java.util.concurrent.ExecutionException; +import java.util.regex.Pattern; +import static java.util.Collections.singletonMap; import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE; import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; @@ -120,7 +133,7 @@ public class SearchQueryIT extends ESIntegTestCase { @Override protected Collection> nodePlugins() { - return Collections.singleton(InternalSettingsPlugin.class); + return Arrays.asList(InternalSettingsPlugin.class, MockAnalysisPlugin.class); } @Override @@ -1897,6 +1910,107 @@ public void testFieldAliasesForMetaFields() throws Exception { } + /** + * Test that wildcard queries on keyword fields get normalized + */ + public void testWildcardQueryNormalizationOnKeywordField() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder() + .put("index.analysis.normalizer.lowercase_normalizer.type", "custom") + .putList("index.analysis.normalizer.lowercase_normalizer.filter", "lowercase") + .build()) + .addMapping("_doc", "field1", "type=keyword,normalizer=lowercase_normalizer")); + client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get(); + refresh(); + + { + WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = wildcardQuery("field1", "bb*"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + } + + /** + * Test that wildcard queries on text fields get normalized + */ + public void testWildcardQueryNormalizationOnTextField() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder() + .put("index.analysis.analyzer.lowercase_analyzer.type", "custom") + .put("index.analysis.analyzer.lowercase_analyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.lowercase_analyzer.filter", "lowercase") + .build()) + .addMapping("_doc", "field1", "type=text,analyzer=lowercase_analyzer")); + client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get(); + refresh(); + + { + WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = wildcardQuery("field1", "bb*"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + } + + /** + * Reserved characters should be excluded when the normalization is applied for keyword fields. + * See https://github.com/elastic/elasticsearch/issues/46300 for details. + */ + public void testWildcardQueryNormalizationKeywordSpecialCharacters() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder().put("index.analysis.char_filter.no_wildcard.type", "mock_pattern_replace") + .put("index.analysis.normalizer.no_wildcard.type", "custom") + .put("index.analysis.normalizer.no_wildcard.char_filter", "no_wildcard").build()) + .addMapping("_doc", "field", "type=keyword,normalizer=no_wildcard")); + client().prepareIndex("test", "_doc", "1").setSource("field", "label-1").get(); + refresh(); + + WildcardQueryBuilder wildCardQuery = wildcardQuery("field", "la*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = wildcardQuery("field", "la*el-?"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + + public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { + + @Override + public Map> getCharFilters() { + return singletonMap("mock_pattern_replace", (indexSettings, env, name, settings) -> { + class Factory implements NormalizingCharFilterFactory { + + private final Pattern pattern = Regex.compile("[\\*\\?]", null); + + @Override + public String name() { + return name; + } + + @Override + public Reader create(Reader reader) { + return new PatternReplaceCharFilter(pattern, "", reader); + } + } + return new Factory(); + }); + } + + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false))); + } + } + /** * Test correct handling {@link SpanBooleanQueryRewriteWithMaxClause#rewrite(IndexReader, MultiTermQuery)}. That rewrite method is e.g. * set for fuzzy queries with "constant_score" rewrite nested inside a `span_multi` query and would cause NPEs due to an unset