-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add conditional token filter to elasticsearch #31958
Changes from 10 commits
e5b20de
da0fd1e
df9bffc
402ed36
fb7c21d
d8f0170
bcee3f0
bba5939
21cd02f
4315682
52955df
dd139c7
1d5deff
57a73f2
609951b
801a704
f923d9c
a21fb82
62fea58
35a6774
9203e50
849e690
de213a8
3bbc8c6
4440df5
2ab7db3
2183fff
0e7a6b4
315999d
b959534
94d3311
2945c3a
e48de6a
ec470f2
f446f91
bd20e99
71cd43b
e045ad6
dabbba1
82e8fce
9d48815
040bc9d
142d24a
ce8b3e3
a1ad7a1
bbe1b7c
391641c
6ec52fe
ced669b
e38e69c
4a9fbe7
5f130a2
1dd0279
53f029b
e325526
e514ad0
50d8fa8
c0ffec7
ee4ef86
9371e77
780697f
1106355
7ce9926
8ff5735
97fbe49
ca2844f
8bad2c6
89bce93
c2ee07b
24547e8
5a383c2
5bad3a8
94330d8
3d0854d
70d2db3
5afea06
bb9fae0
b31dc36
a7c8e07
7490ec6
a481ef6
346edfa
cdf5c8a
a835503
ff7ff36
101458b
88c4f6c
8486f24
413d211
fa16875
688deeb
ace3771
84ee20e
947fa2e
e67cede
8924ac3
b79dc6a
d225048
997ebe8
f591095
67a4dcb
93ecf1d
945fadf
303de4f
33b9da4
b775f71
1dff0f6
546aa11
701fbf2
396843f
8bc3d65
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
[[analysis-condition-tokenfilter]] | ||
=== Conditional Token Filter | ||
|
||
The conditional token filter takes a predicate script and a list of subfilters, and | ||
only applies the subfilters to the current token if it matches the predicate. | ||
|
||
[float] | ||
=== Options | ||
[horizontal] | ||
filters:: a list of token filters to apply to the current token if the predicate | ||
matches. These can be any token filters defined elsewhere in the index mappings. | ||
|
||
script:: a predicate script that determines whether or not the filters will be applied | ||
to the current token. Note that only inline scripts are supported | ||
|
||
[float] | ||
=== Settings example | ||
|
||
You can set it up like: | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
PUT /condition_example | ||
{ | ||
"settings" : { | ||
"analysis" : { | ||
"analyzer" : { | ||
"my_analyzer" : { | ||
"tokenizer" : "standard", | ||
"filter" : [ "my_condition" ] | ||
} | ||
}, | ||
"filter" : { | ||
"my_condition" : { | ||
"type" : "condition", | ||
"filters" : [ "lowercase" ], | ||
"script" : { | ||
"source" : "return term.term.length() < 5" <1> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't believe you need the |
||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
-------------------------------------------------- | ||
// CONSOLE | ||
|
||
<1> This will only apply the lowercase filter to terms that are less than 5 | ||
characters in length | ||
|
||
And test it like: | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
POST /condition_example/_analyze | ||
{ | ||
"analyzer" : "my_analyzer", | ||
"text" : "What Flapdoodle" | ||
} | ||
-------------------------------------------------- | ||
// CONSOLE | ||
// TEST[continued] | ||
|
||
And it'd respond: | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
{ | ||
"tokens": [ | ||
{ | ||
"token": "what", <1> | ||
"start_offset": 0, | ||
"end_offset": 4, | ||
"type": "<ALPHANUM>", | ||
"position": 0 | ||
}, | ||
{ | ||
"token": "Flapdoodle", <2> | ||
"start_offset": 5, | ||
"end_offset": 15, | ||
"type": "<ALPHANUM>", | ||
"position": 0 | ||
} | ||
] | ||
} | ||
-------------------------------------------------- | ||
// TESTRESPONSE | ||
<1> The term `What` has been lowercased, because it is only 4 characters long | ||
<2> The term `Flapdoodle` has been left in its original case, because it doesn't pass | ||
the predicate |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,9 +111,16 @@ | |
import org.apache.lucene.analysis.tr.ApostropheFilter; | ||
import org.apache.lucene.analysis.tr.TurkishAnalyzer; | ||
import org.apache.lucene.analysis.util.ElisionFilter; | ||
import org.apache.lucene.util.SetOnce; | ||
import org.elasticsearch.client.Client; | ||
import org.elasticsearch.cluster.service.ClusterService; | ||
import org.elasticsearch.common.io.stream.NamedWriteableRegistry; | ||
import org.elasticsearch.common.logging.DeprecationLogger; | ||
import org.elasticsearch.common.logging.Loggers; | ||
import org.elasticsearch.common.regex.Regex; | ||
import org.elasticsearch.common.xcontent.NamedXContentRegistry; | ||
import org.elasticsearch.env.Environment; | ||
import org.elasticsearch.env.NodeEnvironment; | ||
import org.elasticsearch.index.analysis.AnalyzerProvider; | ||
import org.elasticsearch.index.analysis.CharFilterFactory; | ||
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; | ||
|
@@ -127,10 +134,15 @@ | |
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; | ||
import org.elasticsearch.plugins.AnalysisPlugin; | ||
import org.elasticsearch.plugins.Plugin; | ||
import org.elasticsearch.script.ScriptService; | ||
import org.elasticsearch.threadpool.ThreadPool; | ||
import org.elasticsearch.watcher.ResourceWatcherService; | ||
import org.tartarus.snowball.ext.DutchStemmer; | ||
import org.tartarus.snowball.ext.FrenchStemmer; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.TreeMap; | ||
|
@@ -141,6 +153,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { | |
|
||
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(CommonAnalysisPlugin.class)); | ||
|
||
private final SetOnce<ScriptService> scriptService = new SetOnce<>(); | ||
|
||
@Override | ||
public Collection<Object> createComponents(Client client, ClusterService clusterService, ThreadPool threadPool, ResourceWatcherService resourceWatcherService, ScriptService scriptService, NamedXContentRegistry xContentRegistry, Environment environment, NodeEnvironment nodeEnvironment, NamedWriteableRegistry namedWriteableRegistry) { | ||
this.scriptService.set(scriptService); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where're never very happy with using |
||
return Collections.emptyList(); | ||
} | ||
|
||
@Override | ||
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() { | ||
Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> analyzers = new TreeMap<>(); | ||
|
@@ -202,6 +222,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { | |
filters.put("classic", ClassicFilterFactory::new); | ||
filters.put("czech_stem", CzechStemTokenFilterFactory::new); | ||
filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); | ||
filters.put("condition", | ||
requriesAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get()))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. requires has a typo? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a pre-existing typo; I'll open a separate PR to fix it |
||
filters.put("decimal_digit", DecimalDigitFilterFactory::new); | ||
filters.put("delimited_payload_filter", LegacyDelimitedPayloadTokenFilterFactory::new); | ||
filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
package org.elasticsearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.IndexSettings; | ||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; | ||
import org.elasticsearch.index.analysis.ReferringFilterFactory; | ||
import org.elasticsearch.index.analysis.TokenFilterFactory; | ||
import org.elasticsearch.script.AnalysisPredicateScript; | ||
import org.elasticsearch.script.Script; | ||
import org.elasticsearch.script.ScriptService; | ||
import org.elasticsearch.script.ScriptType; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
/** | ||
* A factory for a conditional token filter that only applies child filters if the underlying token | ||
* matches an {@link AnalysisPredicateScript} | ||
*/ | ||
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory { | ||
|
||
private final AnalysisPredicateScript.Factory factory; | ||
private final List<TokenFilterFactory> filters = new ArrayList<>(); | ||
private final List<String> filterNames; | ||
|
||
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name, | ||
Settings settings, ScriptService scriptService) { | ||
super(indexSettings, name, settings); | ||
|
||
Settings scriptSettings = settings.getAsSettings("script"); | ||
Script script = Script.parse(scriptSettings); | ||
if (script.getType() != ScriptType.INLINE) { | ||
throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]"); | ||
} | ||
this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT); | ||
|
||
this.filterNames = settings.getAsList("filters"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we fail on an empty list? |
||
} | ||
|
||
@Override | ||
public TokenStream create(TokenStream tokenStream) { | ||
Function<TokenStream, TokenStream> filter = in -> { | ||
for (TokenFilterFactory tff : filters) { | ||
in = tff.create(in); | ||
} | ||
return in; | ||
}; | ||
AnalysisPredicateScript script = factory.newInstance(); | ||
final AnalysisPredicateScript.Term term = new AnalysisPredicateScript.Term(); | ||
return new ConditionalTokenFilter(tokenStream, filter) { | ||
|
||
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); | ||
PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); | ||
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); | ||
TypeAttribute typeAtt = addAttribute(TypeAttribute.class); | ||
KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); | ||
|
||
@Override | ||
protected boolean shouldFilter() { | ||
term.term = termAtt; | ||
term.posInc = posIncAtt.getPositionIncrement(); | ||
term.posLen = posLenAtt.getPositionLength(); | ||
term.startOffset = offsetAtt.startOffset(); | ||
term.endOffset = offsetAtt.endOffset(); | ||
term.type = typeAtt.type(); | ||
term.isKeyword = keywordAtt.isKeyword(); | ||
return script.execute(term); | ||
} | ||
}; | ||
} | ||
|
||
@Override | ||
public void setReferences(Map<String, TokenFilterFactory> factories) { | ||
for (String filter : filterNames) { | ||
TokenFilterFactory tff = factories.get(filter); | ||
if (tff == null) { | ||
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() + | ||
"] refers to undefined token filter [" + filter + "]"); | ||
} | ||
filters.add(tff); | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package org.elasticsearch.analysis.common; | ||
|
||
import org.elasticsearch.Version; | ||
import org.elasticsearch.cluster.metadata.IndexMetaData; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.env.Environment; | ||
import org.elasticsearch.env.TestEnvironment; | ||
import org.elasticsearch.index.IndexSettings; | ||
import org.elasticsearch.index.analysis.IndexAnalyzers; | ||
import org.elasticsearch.index.analysis.NamedAnalyzer; | ||
import org.elasticsearch.indices.analysis.AnalysisModule; | ||
import org.elasticsearch.script.AnalysisPredicateScript; | ||
import org.elasticsearch.script.Script; | ||
import org.elasticsearch.script.ScriptContext; | ||
import org.elasticsearch.script.ScriptService; | ||
import org.elasticsearch.test.ESTokenStreamTestCase; | ||
import org.elasticsearch.test.IndexSettingsModule; | ||
|
||
import java.util.Collections; | ||
|
||
public class ScriptedConditionTokenFilterTests extends ESTokenStreamTestCase { | ||
|
||
public void testSimpleCondition() throws Exception { | ||
Settings settings = Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.build(); | ||
Settings indexSettings = Settings.builder() | ||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) | ||
.put("index.analysis.filter.cond.type", "condition") | ||
.put("index.analysis.filter.cond.script.source", "return \"two\".equals(term.term)") | ||
.putList("index.analysis.filter.cond.filters", "uppercase") | ||
.put("index.analysis.analyzer.myAnalyzer.type", "custom") | ||
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") | ||
.putList("index.analysis.analyzer.myAnalyzer.filter", "cond") | ||
.build(); | ||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); | ||
|
||
AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() { | ||
@Override | ||
public boolean execute(Term term) { | ||
return "two".contentEquals(term.term); | ||
} | ||
}; | ||
|
||
ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){ | ||
@Override | ||
public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) { | ||
assertEquals(context, AnalysisPredicateScript.CONTEXT); | ||
assertEquals(new Script("return \"two\".equals(term.term)"), script); | ||
return (FactoryType) factory; | ||
} | ||
}; | ||
|
||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); | ||
plugin.createComponents(null, null, null, null, scriptService, null, null, null, null); | ||
AnalysisModule module | ||
= new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin)); | ||
|
||
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings); | ||
|
||
try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) { | ||
assertNotNull(analyzer); | ||
assertAnalyzesTo(analyzer, "one two three", new String[]{ | ||
"one", "TWO", "three" | ||
}); | ||
} | ||
|
||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -175,3 +175,13 @@ class org.elasticsearch.index.similarity.ScriptedSimilarity$Doc { | |
int getLength() | ||
float getFreq() | ||
} | ||
|
||
class org.elasticsearch.script.AnalysisPredicateScript$Term { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should it be called Token rather than Term? It might be just me but I feel like it better carries the information that there are other attributes here like positions and offsets. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should look at creating a new context and adding this to the whitelist of that context. See `plugins/example/painless-whitelist' for some of that. |
||
CharSequence term | ||
int posInc | ||
int posLen | ||
int startOffset | ||
int endOffset | ||
String type | ||
boolean isKeyword | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm exposing this information directly at the moment, but it might be better to expose getters instead so that consumers don't try and do things like change offsets or position increments There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 to exposing getters There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also should we expoe the absolute position rather than the position increment? This looks more useful to me for filtering. Or both? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Painless doesn't require the |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package org.elasticsearch.painless; | ||
|
||
import org.elasticsearch.painless.spi.Whitelist; | ||
import org.elasticsearch.script.AnalysisPredicateScript; | ||
import org.elasticsearch.script.ScriptContext; | ||
|
||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
public class AnalysisScriptTests extends ScriptTestCase { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should be in your plugin instead of painless if we can manage it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't follow - do you mean moving the analysis script context out of core and into the common analysis plugin, or do you mean creating an entirely new plugin just for this token filter (and any other script-based filters we come up with)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean I feel like it should live in the common analysis plugin and not in the server at all. Since it is a new thing I figure we should isolate it as much as we can so we know when stuff is using it. Not because it is dirty or anything, just because a smaller core is easier to reason about. And keeping this entirely within the analysis-common plugin helps to prove out the work that we've done for plugins extending painless. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The tricky part here will be unit testing, as ScriptTestCase is a painless-specific class. The painless extension module uses rest tests only, but I'd much rather have compiled unit tests here. Maybe we should build a separate painless-testing module, that can be used for tests elsewhere? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general we try to unit test this sort of thing without painless and use a mock script engine instead that just calls code on the test class. It is totally reasonable to have a few integration tests that do use painless though. |
||
|
||
@Override | ||
protected Map<ScriptContext<?>, List<Whitelist>> scriptContexts() { | ||
Map<ScriptContext<?>, List<Whitelist>> contexts = new HashMap<>(); | ||
contexts.put(AnalysisPredicateScript.CONTEXT, Whitelist.BASE_WHITELISTS); | ||
return contexts; | ||
} | ||
|
||
public void testAnalysisScript() { | ||
AnalysisPredicateScript.Factory factory = scriptEngine.compile("test", "return \"one\".contentEquals(term.term)", | ||
AnalysisPredicateScript.CONTEXT, Collections.emptyMap()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having to use |
||
|
||
AnalysisPredicateScript script = factory.newInstance(); | ||
AnalysisPredicateScript.Term term = new AnalysisPredicateScript.Term(); | ||
term.term = "one"; | ||
assertTrue(script.execute(term)); | ||
term.term = "two"; | ||
assertFalse(script.execute(term)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should be explicit that these filters are chained. Maybe call it
filter
(no plural) for consistency with analyzers?