Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wildcard field - add normalizer support (#53851) #54109

Merged
merged 1 commit into from
Mar 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/wildcard.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ POST my_index/_doc/_search
--------------------------------------------------


[[wildcard-params]]
==== Parameters for wildcard fields

The following parameters are accepted by `wildcard` fields:

[horizontal]

<<ignore-above,`ignore_above`>>::

Do not index any string longer than this value. Defaults to `2147483647`,
so that all values are accepted.

<<normalizer,`normalizer`>>::

How to pre-process the value prior to indexing. Defaults to `null`,
meaning the value is kept as-is.

==== Limitations

* `wildcard` fields are untokenized, like keyword fields, so they do not support queries that rely on word positions, such as phrase queries.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
}

public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context) {
throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
Expand Down Expand Up @@ -93,6 +94,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer
return query;
}

/**
 * Applies {@code normalizer} to a wildcard pattern while leaving the parts of the pattern
 * matched by {@code WILDCARD_PATTERN} untouched.
 * <p>
 * The pattern is split at every {@code WILDCARD_PATTERN} match. The text between matches is
 * normalized chunk by chunk, and each matched group is copied into the result verbatim.
 *
 * @param fieldname  the field name passed to {@link Analyzer#normalize(String, String)}
 * @param value      the wildcard pattern to normalize
 * @param normalizer the analyzer used for normalization; if {@code null}, {@code value} is returned unchanged
 * @return the pattern with its non-wildcard chunks normalized
 */
public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) {
    if (normalizer == null) {
        return value;
    }
    // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g. there
    // is a char_filter that would otherwise remove them
    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
    BytesRefBuilder sb = new BytesRefBuilder();
    int last = 0;

    while (wildcardMatcher.find()) {
        // Compare against the end of the previous match rather than 0, so that adjacent matches
        // do not send an empty chunk through the normalizer.
        if (wildcardMatcher.start() > last) {
            String chunk = value.substring(last, wildcardMatcher.start());

            BytesRef normalized = normalizer.normalize(fieldname, chunk);
            sb.append(normalized);
        }
        // append the matched group - without normalizing
        sb.append(new BytesRef(wildcardMatcher.group()));

        last = wildcardMatcher.end();
    }
    // normalize whatever literal text follows the last match
    if (last < value.length()) {
        String chunk = value.substring(last);
        BytesRef normalized = normalizer.normalize(fieldname, chunk);
        sb.append(normalized);
    }
    return sb.toBytesRef().utf8ToString();
}

@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
failIfNotIndexed();
Expand All @@ -103,30 +134,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu

Term term;
if (searchAnalyzer() != null) {
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
// is a char_filter that would otherwise remove them
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;

while (wildcardMatcher.find()) {
if (wildcardMatcher.start() > 0) {
String chunk = value.substring(last, wildcardMatcher.start());

BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
// append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));

last = wildcardMatcher.end();
}
if (last < value.length()) {
String chunk = value.substring(last);
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
term = new Term(name(), sb.toBytesRef());
value = normalizeWildcardPattern(name(), value, searchAnalyzer());
term = new Term(name(), value);
} else {
term = new Term(name(), indexedValueForSearch(value));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ public static RangeQueryBuilder rangeQuery(String name) {
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
* a Wildcard term should not start with one of the wildcards {@code *} or
* {@code ?}.
* {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
*
* @param name The field name
* @param query The wildcard query string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public void testNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,7 @@ public void testPrefixNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
query.lenient(true);
query.toQuery(context); // no exception
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,20 @@ setup:
body:
settings:
number_of_replicas: 0
analysis:
normalizer:
lowercase:
type: custom
char_filter: []
filter: ["lowercase"]
mappings:
properties:
my_wildcard:
type: wildcard
normalizer: lowercase
fields:
case_sensitive:
type: wildcard
- do:
index:
index: test-index
Expand All @@ -26,6 +36,12 @@ setup:
id: 2
body:
my_wildcard: goodbye world
- do:
index:
index: test-index
id: 3
body:
my_wildcard: cAsE iNsEnSiTiVe World

- do:
indices.refresh: {}
Expand Down Expand Up @@ -80,6 +96,31 @@ setup:
my_wildcard: {value: "*ello worl*" }


- match: {hits.total.value: 1}
---
"Case insensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard: {value: "*Worl*" }


- match: {hits.total.value: 3}

---
"Case sensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard.case_sensitive: {value: "*Worl*" }


- match: {hits.total.value: 1}

---
Expand All @@ -93,7 +134,7 @@ setup:
my_wildcard: {value: "*ld" }


- match: {hits.total.value: 2}
- match: {hits.total.value: 3}

---
"Long suffix query":
Expand Down Expand Up @@ -188,8 +229,8 @@ setup:
terms: {field: "my_wildcard" }


- match: {hits.total.value: 2}
- length: { aggregations.top_vals.buckets: 2 }
- match: {hits.total.value: 3}
- length: { aggregations.top_vals.buckets: 3 }

---
"Sort works":
Expand All @@ -199,20 +240,21 @@ setup:
track_total_hits: true
sort: [ { "my_wildcard": "desc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "1" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "3" }

- do:
search:
body:
track_total_hits: true
sort: [ { "my_wildcard": "asc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }

- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "3" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "1" }

Loading