Skip to content

Commit

Permalink
Add support for ignore_keywords flag in word delimiter graph token fi…
Browse files Browse the repository at this point in the history
…lter

Support ignore_keywords flag for word delimiter graph token filter

Lucene's WordDelimiterGraphFilter can skip processing of tokens that are tagged as keywords. However, the Elasticsearch word delimiter graph token filter does not support this yet. I would like to update the Elasticsearch implementation to incorporate the ignore_keywords flag, enabling better customization of token filters.

Fix for #59491
  • Loading branch information
malpani committed Jul 14, 2020
1 parent a51dda8 commit 751dd13
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
Expand Down Expand Up @@ -87,6 +88,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
// If not null is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException {
expectedIncr, expectedPosLen, null);
}

/**
 * Checks how the word delimiter filter under test treats a token that an upstream
 * {@code keyword_marker} filter has flagged as a keyword.
 * <p>
 * The same analyzer chain (whitespace tokenizer, then {@code my_keyword}, then
 * {@code my_word_delimiter}) is built twice: first without {@code ignore_keywords},
 * where the test expects "PowerHungry" to be split like any other term, and then with
 * {@code ignore_keywords} set to "true", where the test expects "PowerHungry" to be
 * emitted as a single token.
 *
 * @throws IOException declared by the signature; presumably raised while building the analysis or consuming the token stream
 */
public void testIgnoreKeywords() throws IOException {
    final String text = "PowerShot PowerHungry";

    // Phase 1: ignore_keywords is left unset, so the default applies.
    final Settings baseSettings = Settings.builder()
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .put("index.analysis.filter.my_keyword.type", "keyword_marker")
        .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
        .put("index.analysis.analyzer.my_analyzer.type", "custom")
        .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
        .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    final ESTestCase.TestAnalysis defaultAnalysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(baseSettings, new CommonAnalysisPlugin());
    final NamedAnalyzer splittingAnalyzer = defaultAnalysis.indexAnalyzers.get("my_analyzer");
    // Both words are expected to be split at the case change.
    assertAnalyzesTo(
        splittingAnalyzer,
        text,
        new String[]{"Power", "Shot", "Power", "Hungry"},
        new int[]{0, 5, 10, 15},
        new int[]{5, 9, 15, 21});

    // Phase 2: identical settings plus ignore_keywords=true.
    final Settings keywordAwareSettings = Settings.builder()
        .put(baseSettings)
        .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
        .build();
    final ESTestCase.TestAnalysis keywordAwareAnalysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(keywordAwareSettings, new CommonAnalysisPlugin());
    final NamedAnalyzer keywordAwareAnalyzer = keywordAwareAnalysis.indexAnalyzers.get("my_analyzer");
    // The keyword-marked term is expected to come through whole, spanning offsets 10..21.
    assertAnalyzesTo(
        keywordAwareAnalyzer,
        text,
        new String[]{"Power", "Shot", "PowerHungry"},
        new int[]{0, 5, 10},
        new int[]{5, 9, 21});
}

public void testPreconfiguredFilter() throws IOException {
// Before 7.3 we don't adjust offsets
{
Expand Down

0 comments on commit 751dd13

Please sign in to comment.