Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.x] [ML] address two edge cases for categorization.GrokPatternCreator#findBestGrokMatchFromExamples (#51168) #51255

Merged
merged 1 commit into from
Jan 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.job.categorization;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.grok.Grok;

import java.util.ArrayList;
Expand All @@ -25,8 +26,11 @@
*/
public final class GrokPatternCreator {

private static final Logger logger = LogManager.getLogger(GrokPatternCreator.class);

private static final String PREFACE = "preface";
private static final String EPILOGUE = "epilogue";
private static final int MAX_RECURSE_DEPTH = 10;

/**
* The first match in this list will be chosen, so it needs to be ordered
Expand Down Expand Up @@ -89,6 +93,11 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
String[] fixedRegexBits = regex.split("\\.[*+]\\??");

// If there are no fixed regex bits, that implies there would only be a single capture group
// And that would mean a pretty uninteresting grok pattern
if (fixedRegexBits.length == 0) {
return regex;
}
// Create a pattern that will capture the bits in between the fixed parts of the regex
//
// E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
Expand All @@ -112,8 +121,7 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// We should never get here. If we do it implies a bug in the original categorization,
// as it's produced a regex that doesn't match the examples.
assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
LogManager.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId,
exampleProcessor.pattern(), example);
logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
}
}

Expand All @@ -125,19 +133,19 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
// Remember (from the first comment in this method) that the first element in this array is
// always the empty string
overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum));
}
return overallGrokPatternBuilder.toString();
}

/**
* Given a collection of strings, work out which (if any) of the grok patterns we're allowed
* to use matches it best. Then append the appropriate grok language to represent that finding
* onto the supplied string builder.
*/
static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {
private static void appendBestGrokMatchForStrings(String jobId,
Map<String, Integer> fieldNameCountStore,
StringBuilder overallGrokPatternBuilder,
boolean isFirst,
boolean isLast,
Collection<String> mustMatchStrings,
int numRecurse) {

GrokPatternCandidate bestCandidate = null;
if (mustMatchStrings.isEmpty() == false) {
Expand All @@ -149,7 +157,10 @@ static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountSto
}
}

if (bestCandidate == null) {
if (bestCandidate == null || numRecurse >= MAX_RECURSE_DEPTH) {
if (bestCandidate != null) {
logger.warn("[{}] exited grok discovery early, reached max depth [{}]", jobId, MAX_RECURSE_DEPTH);
}
if (isLast) {
overallGrokPatternBuilder.append(".*");
} else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
Expand All @@ -161,13 +172,39 @@ static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountSto
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
appendBestGrokMatchForStrings(jobId,
fieldNameCountStore,
overallGrokPatternBuilder,
isFirst,
false,
prefaces,
numRecurse + 1);
overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
.append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
.append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
appendBestGrokMatchForStrings(jobId,
fieldNameCountStore,
overallGrokPatternBuilder,
false, isLast,
epilogues,
numRecurse + 1);
}
}


/**
* Given a collection of strings, work out which (if any) of the grok patterns we're allowed
* to use matches it best. Then append the appropriate grok language to represent that finding
* onto the supplied string builder.
*/
static void appendBestGrokMatchForStrings(String jobId,
Map<String, Integer> fieldNameCountStore,
StringBuilder overallGrokPatternBuilder,
boolean isFirst,
boolean isLast,
Collection<String> mustMatchStrings) {
appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, isFirst, isLast, mustMatchStrings, 0);
}

/**
* Given a collection of strings, and a grok pattern that matches some part of them all,
* return collections of the bits that come before (prefaces) and after (epilogues) the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.util.HashMap;
import java.util.Map;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.Matchers.containsInAnyOrder;

public class GrokPatternCreatorTests extends ESTestCase {
Expand Down Expand Up @@ -71,7 +72,8 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -87,7 +89,8 @@ public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -105,7 +108,8 @@ public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -120,7 +124,8 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString());
}
Expand All @@ -134,7 +139,8 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak()
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

// It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers
assertEquals(".+?", overallGrokPatternBuilder.toString());
Expand All @@ -150,7 +156,8 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -163,7 +170,8 @@ public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

// We don't want the .1. in the middle to get detected as a hex number
assertEquals(".+?", overallGrokPatternBuilder.toString());
Expand All @@ -178,7 +186,8 @@ public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -192,7 +201,8 @@ public void testAppendBestGrokMatchForStringsGivenUris() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString());
}
Expand All @@ -206,7 +216,8 @@ public void testAppendBestGrokMatchForStringsGivenPaths() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();

GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
false, mustMatchStrings);

assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString());
}
Expand Down Expand Up @@ -263,4 +274,74 @@ public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() {
"%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}

public void testFindBestGrokMatchFromExamplesGivenAdversarialInputRecurseDepth() {
String regex = ".*?combo.+?rpc\\.statd.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"combo rpc.statd[1605]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
"combo rpc.statd[1608]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
"combo rpc.statd[1635]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
"\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
"\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...");
assertEquals(
".*?combo.+?rpc\\.statd.+?%{NUMBER:field}.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.+?%{NUMBER:field2}" +
".+?%{NUMBER:field3}.+?%{NUMBER:field4}.+?%{NUMBER:field5}.+?%{NUMBER:field6}.+?%{NUMBER:field7}.+?%{NUMBER:field8}" +
".+?%{NUMBER:field9}.+?%{NUMBER:field10}.+?%{NUMBER:field11}.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}

public void testFindBestGrokMatchFromExamplesGivenMatchAllRegex() {
String regex = ".*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"Killing job [count_tweets]",
"Killing job [tweets_by_location]",
"[count_tweets] Killing job",
"[tweets_by_location] Killing job");
assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
}
}