From 751dd13d215be4d1aa78944a08507ed3a00ba213 Mon Sep 17 00:00:00 2001 From: Malpani Date: Tue, 14 Jul 2020 12:13:33 -0700 Subject: [PATCH 1/3] Add support for ignore_keywords flag in word delimiter graph token filter Support ignore_keywords flag for word delimiter graph token filter Lucene's WordDelimiterGraphFilter allows skipping processing of tokens tagged as keyword. However, the Elasticsearch word delimiter graph token filter does not support this yet. I would like to update the Elasticsearch implementation to incorporate the ignore_keywords flag to enable better customization of token filters. Fix for https://github.com/elastic/elasticsearch/issues/59491 --- .../WordDelimiterGraphTokenFilterFactory.java | 3 ++ ...DelimiterGraphTokenFilterFactoryTests.java | 32 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java index bd7cd7ed3c5dc..99416869342a9 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java @@ -38,6 +38,7 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE; import static 
org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS; @@ -87,6 +88,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); + // If set, suppresses processing terms with KeywordAttribute#isKeyword()=true. + flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false); // If not null is the set of tokens to protect from being delimited Set protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java index ec61d614db97f..a027781f022d0 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java @@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException { expectedIncr, expectedPosLen, null); } + public void testIgnoreKeywords() throws IOException { + //test with keywords but ignore is false (default behavior) + Settings settings = Settings.builder() + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .put("index.analysis.filter.my_keyword.type", "keyword_marker") + .put("index.analysis.filter.my_keyword.keywords", "PowerHungry") + .put("index.analysis.analyzer.my_analyzer.type", "custom") + .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace") + 
.put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); + String source = "PowerShot PowerHungry"; + int[] expectedStartOffsets = new int[]{0, 5, 10, 15}; + int[] expectedEndOffsets = new int[]{5, 9, 15, 21}; + String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"}; + NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); + assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets); + + //test with keywords but ignore_keywords is set as true + settings = Settings.builder().put(settings) + .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true") + .build(); + analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); + analyzer = analysis.indexAnalyzers.get("my_analyzer"); + expectedStartOffsets = new int[]{0, 5, 10}; + expectedEndOffsets = new int[]{5, 9, 21}; + expected = new String[]{"Power", "Shot", "PowerHungry"}; + assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets); + } + public void testPreconfiguredFilter() throws IOException { // Before 7.3 we don't adjust offsets { From 1a66b001266a7cb5f92917caf278132c61175811 Mon Sep 17 00:00:00 2001 From: Ankit Malpani Date: Sun, 19 Jul 2020 00:15:51 -0700 Subject: [PATCH 2/3] Update docs for the ignore_keywords attribute --- .../word-delimiter-graph-tokenfilter.asciidoc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc index 2fa9c41ad79b6..5ae0249a563a2 100644 --- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc +++ 
b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. +`ignore_keywords`:: +(Optional, boolean) +If `true`, the filter suppresses processing tokens with +{lucene-core-javadoc}/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.html#isKeyword()[KeywordAttribute.isKeyword()=true] +Defaults to `false`. + [[word-delimiter-graph-tokenfilter-preserve-original]] `preserve_original`:: + @@ -496,4 +502,4 @@ spans one in the token graph, making it invalid. image::images/analysis/token-graph-wd.svg[align="center"] -==== \ No newline at end of file +==== From 4ad08cbfb44ef37d6d482f75e7e4f9e546d70937 Mon Sep 17 00:00:00 2001 From: James Rodewig Date: Mon, 20 Jul 2020 08:39:35 -0400 Subject: [PATCH 3/3] Update docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc --- .../tokenfilters/word-delimiter-graph-tokenfilter.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc index 5ae0249a563a2..3858c5eeb0f6e 100644 --- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -272,8 +272,8 @@ Defaults to `true`. `ignore_keywords`:: (Optional, boolean) -If `true`, the filter suppresses processing tokens with -{lucene-core-javadoc}/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.html#isKeyword()[KeywordAttribute.isKeyword()=true] +If `true`, the filter skips tokens with +a `keyword` attribute of `true`. Defaults to `false`. [[word-delimiter-graph-tokenfilter-preserve-original]]