Skip to content

Commit

Permalink
[ML] address two edge cases for categorization.GrokPatternCreator#fin…
Browse files Browse the repository at this point in the history
…dBestGrokMatchFromExamples (elastic#51168)

There are two edge cases that can be ran into when example input is matched in a weird way.

1. Recursion depth could continue many many times, resulting in a HUGE runtime cost. I put a limit of 10 recursions (could be adjusted I suppose). 
2. If there are no "fixed regex bits", exploring the grok space would result in a fence-post error during runtime (with assertions turned off)
  • Loading branch information
benwtrent committed Jan 21, 2020
1 parent 788836e commit 301f980
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.job.categorization;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.grok.Grok;

import java.util.ArrayList;
Expand All @@ -25,8 +26,11 @@
*/
public final class GrokPatternCreator {

private static final Logger logger = LogManager.getLogger(GrokPatternCreator.class);

private static final String PREFACE = "preface";
private static final String EPILOGUE = "epilogue";
private static final int MAX_RECURSE_DEPTH = 10;

/**
* The first match in this list will be chosen, so it needs to be ordered
Expand Down Expand Up @@ -89,6 +93,11 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
String[] fixedRegexBits = regex.split("\\.[*+]\\??");

// If there are no fixed regex bits, that implies there would only be a single capture group
// And that would mean a pretty uninteresting grok pattern
if (fixedRegexBits.length == 0) {
return regex;
}
// Create a pattern that will capture the bits in between the fixed parts of the regex
//
// E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
Expand All @@ -112,8 +121,7 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// We should never get here. If we do it implies a bug in the original categorization,
// as it's produced a regex that doesn't match the examples.
assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
LogManager.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId,
exampleProcessor.pattern(), example);
logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
}
}

Expand All @@ -125,19 +133,19 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// Remember (from the first comment in this method) that the first element in this array is
// always the empty string
overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum));
}
return overallGrokPatternBuilder.toString();
}

/**
* Given a collection of strings, work out which (if any) of the grok patterns we're allowed
* to use matches it best. Then append the appropriate grok language to represent that finding
* onto the supplied string builder.
*/
static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {
private static void appendBestGrokMatchForStrings(String jobId,
Map<String, Integer> fieldNameCountStore,
StringBuilder overallGrokPatternBuilder,
boolean isFirst,
boolean isLast,
Collection<String> mustMatchStrings,
int numRecurse) {

GrokPatternCandidate bestCandidate = null;
if (mustMatchStrings.isEmpty() == false) {
Expand All @@ -149,7 +157,10 @@ static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountSto
}
}

if (bestCandidate == null) {
if (bestCandidate == null || numRecurse >= MAX_RECURSE_DEPTH) {
if (bestCandidate != null) {
logger.warn("[{}] exited grok discovery early, reached max depth [{}]", jobId, MAX_RECURSE_DEPTH);
}
if (isLast) {
overallGrokPatternBuilder.append(".*");
} else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
Expand All @@ -161,13 +172,39 @@ static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountSto
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
appendBestGrokMatchForStrings(jobId,
fieldNameCountStore,
overallGrokPatternBuilder,
isFirst,
false,
prefaces,
numRecurse + 1);
overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
.append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
.append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
appendBestGrokMatchForStrings(jobId,
fieldNameCountStore,
overallGrokPatternBuilder,
false, isLast,
epilogues,
numRecurse + 1);
}
}


/**
* Given a collection of strings, work out which (if any) of the grok patterns we're allowed
* to use matches it best. Then append the appropriate grok language to represent that finding
* onto the supplied string builder.
*/
static void appendBestGrokMatchForStrings(String jobId,
Map<String, Integer> fieldNameCountStore,
StringBuilder overallGrokPatternBuilder,
boolean isFirst,
boolean isLast,
Collection<String> mustMatchStrings) {
appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, isFirst, isLast, mustMatchStrings, 0);
}

/**
* Given a collection of strings, and a grok pattern that matches some part of them all,
* return collections of the bits that come before (prefaces) and after (epilogues) the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.util.HashMap;
import java.util.Map;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.Matchers.containsInAnyOrder;

public class GrokPatternCreatorTests extends ESTestCase {
Expand Down Expand Up @@ -71,7 +72,8 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -87,7 +89,8 @@ public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -105,7 +108,8 @@ public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -120,7 +124,8 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -134,7 +139,8 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak()
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

// It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers
assertEquals(".+?", overallGrokPatternBuilder.toString());
Expand All @@ -150,7 +156,8 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -163,7 +170,8 @@ public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

// We don't want the .1. in the middle to get detected as a hex number
assertEquals(".+?", overallGrokPatternBuilder.toString());
Expand All @@ -178,7 +186,8 @@ public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -192,7 +201,8 @@ public void testAppendBestGrokMatchForStringsGivenUris() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -206,7 +216,8 @@ public void testAppendBestGrokMatchForStringsGivenPaths() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString());
}
Expand Down Expand Up @@ -263,4 +274,74 @@ public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() {
"%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}

public void testFindBestGrokMatchFromExamplesGivenAdversarialInputRecurseDepth() {
String regex = ".*?combo.+?rpc\\.statd.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"combo rpc.statd[1605]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
"combo rpc.statd[1608]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
"combo rpc.statd[1635]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...");
assertEquals(
".*?combo.+?rpc\\.statd.+?%{NUMBER:field}.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.+?%{NUMBER:field2}" +
".+?%{NUMBER:field3}.+?%{NUMBER:field4}.+?%{NUMBER:field5}.+?%{NUMBER:field6}.+?%{NUMBER:field7}.+?%{NUMBER:field8}" +
".+?%{NUMBER:field9}.+?%{NUMBER:field10}.+?%{NUMBER:field11}.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}

public void testFindBestGrokMatchFromExamplesGivenMatchAllRegex() {
String regex = ".*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"Killing job [count_tweets]",
"Killing job [tweets_by_location]",
"[count_tweets] Killing job",
"[tweets_by_location] Killing job");
assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
}
}

0 comments on commit 301f980

Please sign in to comment.