From 9e13202044e9182f704307cdd1208dd45672fedb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Thu, 12 Jul 2018 16:32:35 +0200 Subject: [PATCH 1/4] Add exclusion option to `keep_types` token filter Currently the `keep_types` token filter includes all token types specified using its `types` parameter. Lucene's TypeTokenFilter also provides a second mode where instead of keeping the specified tokens (include) they are filtered out (exclude). This change exposes this option as a new `mode` parameter that can either take the values `include` (the default, if not specified) or `exclude`. Closes #29277 --- .../keep-types-tokenfilter.asciidoc | 74 ++++++++++++++++++- .../common/KeepTypesFilterFactory.java | 15 +++- .../common/KeepTypesFilterFactoryTests.java | 36 +++++++-- 3 files changed, 114 insertions(+), 11 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc index afaf4f8fa8c46..05687f8669155 100644 --- a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -8,8 +8,9 @@ contained in a predefined set. [float] === Options [horizontal] -types:: a list of types to keep - +types:: a list of types to include (default mode) or exclude +mode:: if set to `include` (default) the specified token types will be kept, +if set to `exclude` the specified token types will be removed from the stream [float] === Settings example @@ -53,7 +54,7 @@ POST /keep_types_example/_analyze // CONSOLE // TEST[continued] -And it'd respond: +The response will be: [source,js] -------------------------------------------------- @@ -72,3 +73,70 @@ And it'd respond: // TESTRESPONSE Note how only the `<NUM>` token is in the output.
+ +=== Exclude mode settings example + +If the `mode` parameter is set to `exclude` like in the following example: + +[source,js] +-------------------------------------------------- +PUT /keep_types_exclude_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "remove_numbers"] + } + }, + "filter" : { + "remove_numbers" : { + "type" : "keep_types", + "mode" : "exclude", + "types" : [ "<NUM>" ] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +And we test it like: + +[source,js] +-------------------------------------------------- +POST /keep_types_exclude_example/_analyze +{ + "analyzer" : "my_analyzer", + "text" : "hello 101 world" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +The response will be: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "hello", + "start_offset": 0, + "end_offset": 5, + "type": "<ALPHANUM>", + "position": 0 + }, + { + "token": "world", + "start_offset": 10, + "end_offset": 15, + "type": "<ALPHANUM>", + "position": 2 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE \ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index 0f94b521e4b7d..9b4e76dd0f14c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -43,7 +43,12 @@ */ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { private final Set keepTypes; - private static final String KEEP_TYPES_KEY = "types"; + private final boolean includeMode; + static final String KEEP_TYPES_KEY = "types"; 
+ static final String KEEP_TYPES_MODE = "mode"; + static final String KEEP_TYPES_MODE_INCLUDE = "include"; + static final String KEEP_TYPES_MODE_EXCLUDE = "exclude"; + KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); @@ -52,12 +57,18 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { if ((arrayKeepTypes == null)) { throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); } + final String modeParameter = settings.get(KEEP_TYPES_MODE, KEEP_TYPES_MODE_INCLUDE).toLowerCase(); + if (modeParameter.equals(KEEP_TYPES_MODE_INCLUDE) == false && modeParameter.equals(KEEP_TYPES_MODE_EXCLUDE) == false) { + throw new IllegalArgumentException( + "keep_types mode can only be `" + KEEP_TYPES_MODE_INCLUDE + "` or `" + KEEP_TYPES_MODE_INCLUDE + "`"); + } this.keepTypes = new HashSet<>(arrayKeepTypes); + this.includeMode = modeParameter.equals(KEEP_TYPES_MODE_INCLUDE); } @Override public TokenStream create(TokenStream tokenStream) { - return new TypeTokenFilter(tokenStream, keepTypes, true); + return new TypeTokenFilter(tokenStream, keepTypes, includeMode); } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java index a19882d6faa00..01b9a3bea8298 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java @@ -34,19 +34,43 @@ import static org.hamcrest.Matchers.instanceOf; public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase { - public void testKeepTypes() throws IOException { + + private static final String BASE_SETTING = "index.analysis.filter.keep_numbers"; + + public 
void testKeepTypesInclude() throws IOException { + Settings.Builder settingsBuilder = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }); + // either use default mode or set "include" mode explicitly + if (random().nextBoolean()) { + settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE, + KeepTypesFilterFactory.KEEP_TYPES_MODE_INCLUDE); + } + Settings settings = settingsBuilder.build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); + assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); + String source = "Hello 123 world"; + String[] expected = new String[] { "123" }; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 }); + } + + public void testKeepTypesExclude() throws IOException { Settings settings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.keep_numbers.type", "keep_types") - .putList("index.analysis.filter.keep_numbers.types", new String[] {"", ""}) - .build(); + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) + .put(BASE_SETTING + "." 
+ KeepTypesFilterFactory.KEEP_TYPES_MODE, KeepTypesFilterFactory.KEEP_TYPES_MODE_EXCLUDE).build(); ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); String source = "Hello 123 world"; - String[] expected = new String[]{"123"}; + String[] expected = new String[] { "Hello", "world" }; Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2}); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 }); } } From 302303edaec4bc168585d341e921226082aedd58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Thu, 12 Jul 2018 19:17:02 +0200 Subject: [PATCH 2/4] iter --- .../elasticsearch/analysis/common/KeepTypesFilterFactory.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index 9b4e76dd0f14c..37eabb5e961a9 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -29,6 +29,7 @@ import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Set; /** @@ -57,7 +58,7 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { if ((arrayKeepTypes == null)) { throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); } - final String modeParameter = settings.get(KEEP_TYPES_MODE, KEEP_TYPES_MODE_INCLUDE).toLowerCase(); + final 
String modeParameter = settings.get(KEEP_TYPES_MODE, KEEP_TYPES_MODE_INCLUDE).toLowerCase(Locale.ROOT); if (modeParameter.equals(KEEP_TYPES_MODE_INCLUDE) == false && modeParameter.equals(KEEP_TYPES_MODE_EXCLUDE) == false) { throw new IllegalArgumentException( "keep_types mode can only be `" + KEEP_TYPES_MODE_INCLUDE + "` or `" + KEEP_TYPES_MODE_INCLUDE + "`"); From cbc738c73d0a6c087630350a7efae327474106fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Mon, 16 Jul 2018 10:42:58 +0200 Subject: [PATCH 3/4] Improve exception message --- .../analysis/common/KeepTypesFilterFactory.java | 4 ++-- .../common/KeepTypesFilterFactoryTests.java | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index 37eabb5e961a9..c358f2076021f 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -60,8 +60,8 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { } final String modeParameter = settings.get(KEEP_TYPES_MODE, KEEP_TYPES_MODE_INCLUDE).toLowerCase(Locale.ROOT); if (modeParameter.equals(KEEP_TYPES_MODE_INCLUDE) == false && modeParameter.equals(KEEP_TYPES_MODE_EXCLUDE) == false) { - throw new IllegalArgumentException( - "keep_types mode can only be `" + KEEP_TYPES_MODE_INCLUDE + "` or `" + KEEP_TYPES_MODE_INCLUDE + "`"); + throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KEEP_TYPES_MODE_INCLUDE + "] or [" + + KEEP_TYPES_MODE_EXCLUDE + "] but was [" + modeParameter + "]."); } this.keepTypes = new HashSet<>(arrayKeepTypes); diff --git 
a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java index 01b9a3bea8298..e0f848bbbf7ee 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java @@ -38,8 +38,7 @@ public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase { private static final String BASE_SETTING = "index.analysis.filter.keep_numbers"; public void testKeepTypesInclude() throws IOException { - Settings.Builder settingsBuilder = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(BASE_SETTING + ".type", "keep_types") .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }); // either use default mode or set "include" mode explicitly @@ -59,8 +58,7 @@ public void testKeepTypesInclude() throws IOException { } public void testKeepTypesExclude() throws IOException { - Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(BASE_SETTING + ".type", "keep_types") .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) .put(BASE_SETTING + "." 
+ KeepTypesFilterFactory.KEEP_TYPES_MODE, KeepTypesFilterFactory.KEEP_TYPES_MODE_EXCLUDE).build(); @@ -73,4 +71,14 @@ public void testKeepTypesExclude() throws IOException { tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 }); } + + public void testKeepTypesException() throws IOException { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(BASE_SETTING + ".type", "keep_types") + .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) + .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE, "bad_parameter").build(); + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())); + assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage()); + } } From a33a32c1664100182cd28abfa20359bd0da5e3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Mon, 16 Jul 2018 12:17:04 +0200 Subject: [PATCH 4/4] iter --- .../common/KeepTypesFilterFactory.java | 42 ++++++++++++------- .../common/KeepTypesFilterFactoryTests.java | 8 ++-- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index c358f2076021f..b6b8b45fabfc2 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -35,21 +35,41 @@ /** * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. 
This filter only * keep tokens that are contained in the set configured via - * {@value #KEEP_TYPES_KEY} setting. + * {@value #KEEP_TYPES_MODE_KEY} setting. * <p>
 * Configuration options: * <ul>
- * <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
+ * <li>{@value #KEEP_TYPES_KEY} the array of words / tokens.</li>
+ * <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard + * ("exclude") the specified token types.</li>
 * </ul>
*/ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { private final Set keepTypes; - private final boolean includeMode; + private final KeepTypesMode includeMode; static final String KEEP_TYPES_KEY = "types"; - static final String KEEP_TYPES_MODE = "mode"; - static final String KEEP_TYPES_MODE_INCLUDE = "include"; - static final String KEEP_TYPES_MODE_EXCLUDE = "exclude"; + static final String KEEP_TYPES_MODE_KEY = "mode"; + enum KeepTypesMode { + INCLUDE, EXCLUDE; + + @Override + public String toString() { + return this.name().toLowerCase(Locale.ROOT); + } + + private static KeepTypesMode fromString(String modeString) { + String lc = modeString.toLowerCase(Locale.ROOT); + if (lc.equals("include")) { + return INCLUDE; + } else if (lc.equals("exclude")) { + return EXCLUDE; + } else { + throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or [" + + KeepTypesMode.EXCLUDE + "] but was [" + modeString + "]."); + } + } + }; KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); @@ -58,18 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { if ((arrayKeepTypes == null)) { throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); } - final String modeParameter = settings.get(KEEP_TYPES_MODE, KEEP_TYPES_MODE_INCLUDE).toLowerCase(Locale.ROOT); - if (modeParameter.equals(KEEP_TYPES_MODE_INCLUDE) == false && modeParameter.equals(KEEP_TYPES_MODE_EXCLUDE) == false) { - throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KEEP_TYPES_MODE_INCLUDE + "] or [" - + KEEP_TYPES_MODE_EXCLUDE + "] but was [" + modeParameter + "]."); - } - + this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include")); this.keepTypes = new HashSet<>(arrayKeepTypes); - this.includeMode = 
modeParameter.equals(KEEP_TYPES_MODE_INCLUDE); } @Override public TokenStream create(TokenStream tokenStream) { - return new TypeTokenFilter(tokenStream, keepTypes, includeMode); + return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE); } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java index e0f848bbbf7ee..d0c7723457ff3 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java @@ -43,8 +43,8 @@ public void testKeepTypesInclude() throws IOException { .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }); // either use default mode or set "include" mode explicitly if (random().nextBoolean()) { - settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE, - KeepTypesFilterFactory.KEEP_TYPES_MODE_INCLUDE); + settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, + KeepTypesFilterFactory.KeepTypesMode.INCLUDE); } Settings settings = settingsBuilder.build(); ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); @@ -61,7 +61,7 @@ public void testKeepTypesExclude() throws IOException { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(BASE_SETTING + ".type", "keep_types") .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) - .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE, KeepTypesFilterFactory.KEEP_TYPES_MODE_EXCLUDE).build(); + .put(BASE_SETTING + "." 
+ KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build(); ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); @@ -76,7 +76,7 @@ public void testKeepTypesException() throws IOException { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(BASE_SETTING + ".type", "keep_types") .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "", "" }) - .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE, "bad_parameter").build(); + .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build(); IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())); assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());