From 6135feb529ef412a9957977adc690bdd7f5a5639 Mon Sep 17 00:00:00 2001 From: itsankit-google Date: Mon, 3 Feb 2025 12:23:59 +0000 Subject: [PATCH] Add rule based error classification --- .../io/cdap/cdap/common/conf/Constants.java | 1 + .../proto/ErrorClassificationResponse.java | 23 +-- .../ErrorClassificationResponseWrapper.java | 98 +++++++++ .../cdap/logging/ErrorClassificationRule.java | 195 ++++++++++++++++++ .../cdap/logging/ErrorLogsClassifier.java | 163 +++++++++++---- .../cdap/logging/appender/LogAppender.java | 3 +- .../cdap/logging/ErrorLogsClassifierTest.java | 128 ++++++++++-- 7 files changed, 526 insertions(+), 85 deletions(-) create mode 100644 cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationResponseWrapper.java create mode 100644 cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationRule.java diff --git a/cdap-common/src/main/java/io/cdap/cdap/common/conf/Constants.java b/cdap-common/src/main/java/io/cdap/cdap/common/conf/Constants.java index f231022e5733..2b1cecdf096f 100644 --- a/cdap-common/src/main/java/io/cdap/cdap/common/conf/Constants.java +++ b/cdap-common/src/main/java/io/cdap/cdap/common/conf/Constants.java @@ -1547,6 +1547,7 @@ public static final class Logging { public static final String USER_LOG_TAG_VALUE = "userLog"; public static final String TAG_FAILED_STAGE = "failedStage"; public static final String TAG_ERROR_CATEGORY = "errorCategory"; + public static final String TAG_PARENT_ERROR_CATEGORY = "parentErrorCategory"; public static final String TAG_ERROR_REASON = "errorReason"; public static final String TAG_ERROR_TYPE = "errorType"; public static final String TAG_DEPENDENCY = "dependency"; diff --git a/cdap-proto/src/main/java/io/cdap/cdap/proto/ErrorClassificationResponse.java b/cdap-proto/src/main/java/io/cdap/cdap/proto/ErrorClassificationResponse.java index c6818629d12b..d693f81c96b6 100644 --- a/cdap-proto/src/main/java/io/cdap/cdap/proto/ErrorClassificationResponse.java +++ b/cdap-proto/src/main/java/io/cdap/cdap/proto/ErrorClassificationResponse.java @@ -31,11 +31,10 @@ public final class ErrorClassificationResponse { private final String errorCodeType; private final String errorCode; private final String supportedDocumentationUrl; - private transient final String throwableClassName; private ErrorClassificationResponse(String stageName, String errorCategory, String errorReason, String errorMessage, String errorType, String dependency, String errorCodeType, - String errorCode, String supportedDocumentationUrl, String throwableClassName) { + String errorCode, String supportedDocumentationUrl) { this.stageName = stageName; this.errorCategory = errorCategory; this.errorReason = errorReason; @@ -45,7 +44,6 @@ private ErrorClassificationResponse(String stageName, String errorCategory, Stri this.errorCodeType = errorCodeType; this.errorCode = errorCode; this.supportedDocumentationUrl = supportedDocumentationUrl; - this.throwableClassName = throwableClassName; } /** @@ -111,13 +109,6 @@ public String getSupportedDocumentationUrl() { return supportedDocumentationUrl; } - /** - * Gets the throwable class name for ErrorClassificationResponse. - */ - public String getThrowableClassName() { - return throwableClassName; - } - @Override public boolean equals(Object o) { if (!(o instanceof ErrorClassificationResponse)) { @@ -154,7 +145,6 @@ public static class Builder { private String errorCodeType; private String errorCode; private String supportedDocumentationUrl; - private String throwableClassName; /** * Sets the stage name for ErrorClassificationResponse. @@ -228,21 +218,12 @@ public Builder setSupportedDocumentationUrl(String supportedDocumentationUrl) { return this; } - /** - * Sets the throwable class name for ErrorClassificationResponse. - */ - public Builder setThrowableClassName(String throwableClassName) { - this.throwableClassName = throwableClassName; - return this; - } - /** * Builds and returns a new instance of ErrorClassificationResponse. */ public ErrorClassificationResponse build() { return new ErrorClassificationResponse(stageName, errorCategory, errorReason, errorMessage, - errorType, dependency, errorCodeType, errorCode, supportedDocumentationUrl, - throwableClassName); + errorType, dependency, errorCodeType, errorCode, supportedDocumentationUrl); } } } diff --git a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationResponseWrapper.java b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationResponseWrapper.java new file mode 100644 index 000000000000..7ccef2f0d39d --- /dev/null +++ b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationResponseWrapper.java @@ -0,0 +1,98 @@ +/* + * Copyright © 2025 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.cdap.logging; + +import io.cdap.cdap.proto.ErrorClassificationResponse; +import java.util.Objects; +import javax.annotation.Nullable; + +/** + * Wrapper class for {@link ErrorClassificationResponse}. + */ +public final class ErrorClassificationResponseWrapper { + private final ErrorClassificationResponse errorClassificationResponse; + private final String parentErrorCategory; + private final String throwableClassName; + private final Integer rulePriority; + private final String ruleId; + + /** + * Constructor for {@link ErrorClassificationResponseWrapper}. + */ + public ErrorClassificationResponseWrapper(ErrorClassificationResponse response, + String parentErrorCategory, String throwableClassName, @Nullable Integer rulePriority, + @Nullable String ruleId) { + this.errorClassificationResponse = response; + this.parentErrorCategory = parentErrorCategory; + this.throwableClassName = throwableClassName; + this.ruleId = ruleId; + this.rulePriority = rulePriority; + } + + /** + * Gets the error classification response for ErrorClassificationResponse. + */ + public ErrorClassificationResponse getErrorClassificationResponse() { + return errorClassificationResponse; + } + + /** + * Gets the parent error category for ErrorClassificationResponse. + */ + public String getParentErrorCategory() { + return parentErrorCategory; + } + + /** + * Gets the throwable class name for ErrorClassificationResponse. + */ + public String getThrowableClassName() { + return throwableClassName; + } + + /** + * Gets the rule priority for ErrorClassificationResponse. + */ + public Integer getRulePriority() { + return rulePriority == null ? Integer.MAX_VALUE : rulePriority; + } + + /** + * Gets the rule id for ErrorClassificationResponse. + */ + public String getRuleId() { + return ruleId; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof ErrorClassificationResponseWrapper)) { + return false; + } + ErrorClassificationResponseWrapper that = (ErrorClassificationResponseWrapper) o; + + return Objects.equals(this.errorClassificationResponse, that.errorClassificationResponse) + && Objects.equals(this.throwableClassName, that.throwableClassName) + && Objects.equals(this.rulePriority, that.rulePriority) + && Objects.equals(this.ruleId, that.ruleId); + } + + @Override + public int hashCode() { + return Objects.hash(errorClassificationResponse, throwableClassName, rulePriority, ruleId); + } +} diff --git a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationRule.java b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationRule.java new file mode 100644 index 000000000000..123edc5f8906 --- /dev/null +++ b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorClassificationRule.java @@ -0,0 +1,195 @@ +/* + * Copyright © 2025 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.cdap.logging; + +import com.google.common.base.Strings; +import io.cdap.cdap.api.exception.ErrorType; +import java.util.regex.Pattern; +import javax.annotation.Nullable; + +/** + * Represents the rule for classifying error logs. + */ +public final class ErrorClassificationRule implements Comparable { + public static final Pattern DEFAULT_PATTERN = Pattern.compile(".*"); + private final String id; + private final String description; + private final int priority; + private final String exceptionClassRegex; + private final String codeMethodRegex; + private final ErrorType errorType; + private final boolean dependency; + private Pattern exceptionClassRegexPattern; + private Pattern codeMethodRegexPattern; + + + private ErrorClassificationRule(String id, String description, int priority, + @Nullable String exceptionClassRegex, @Nullable String codeMethodRegex, + ErrorType errorType, boolean dependency) { + this.id = id; + this.description = description; + this.priority = priority; + this.exceptionClassRegex = exceptionClassRegex; + this.codeMethodRegex = codeMethodRegex; + this.errorType = errorType; + this.dependency = dependency; + } + + /** + * Returns the id for ErrorClassificationRule. + */ + public String getId() { + return id; + } + + /** + * Returns the description for ErrorClassificationRule. + */ + public String getDescription() { + return description; + } + + /** + * Returns the priority for ErrorClassificationRule. + */ + public int getPriority() { + return priority; + } + + /** + * Returns the exception class regex pattern for ErrorClassificationRule. + */ + public Pattern getExceptionClassRegex() { + if (this.exceptionClassRegexPattern == null) { + this.exceptionClassRegexPattern = + Strings.isNullOrEmpty(this.exceptionClassRegex) ? DEFAULT_PATTERN + : Pattern.compile(this.exceptionClassRegex); + } + return this.exceptionClassRegexPattern; + } + + /** + * Returns the code method regex pattern for ErrorClassificationRule. + */ + public Pattern getCodeMethodRegex() { + if (this.codeMethodRegexPattern == null) { + this.codeMethodRegexPattern = Strings.isNullOrEmpty(this.codeMethodRegex) ? DEFAULT_PATTERN + : Pattern.compile(this.codeMethodRegex); + } + return this.codeMethodRegexPattern; + } + + /** + * Returns the {@link ErrorType} for ErrorClassificationRule. + */ + public ErrorType getErrorType() { + return errorType == null ? ErrorType.UNKNOWN : errorType; + } + + /** + * Returns the dependency flag for ErrorClassificationRule. + */ + public boolean isDependency() { + return dependency; + } + + @Override + public int compareTo(ErrorClassificationRule that) { + return Integer.compare(this.priority, that.priority); + } + + @Override + public String toString() { + return String.format("Rule Id: '%s', Description: '%s', Exception Class Name: '%s'," + + "Code Method Regex: '%s', Error Type: '%s', Dependency: '%s'", id, description, + getExceptionClassRegex().pattern(), getCodeMethodRegex().pattern(), errorType, dependency); + } + + /** + * Builder class for {@link ErrorClassificationRule}. + */ + public static class Builder { + private String id; + private String description; + private int priority; + private String exceptionClassRegex; + private String codeMethodRegex; + private ErrorType errorType; + private boolean dependency; + + /** + * Sets the id for ErrorClassificationRule. + */ + public Builder setId(String id) { + this.id = id; + return this; + } + + /** + * Sets the description for ErrorClassificationRule. + */ + public Builder setDescription(String description) { + this.description = description; + return this; + } + + /** + * Sets the priority for ErrorClassificationRule. + */ + public Builder setPriority(int priority) { + this.priority = priority; + return this; + } + + /** + * Sets the exception class name regex for ErrorClassificationRule. + */ + public Builder setExceptionClassName(String exceptionClassRegex) { + this.exceptionClassRegex = exceptionClassRegex; + return this; + } + + /** + * Sets the code method regex for ErrorClassificationRule. + */ + public Builder setCodeMethodRegex(String codeMethodRegex) { + this.codeMethodRegex = codeMethodRegex; + return this; + } + + /** + * Sets the {@link ErrorType} for ErrorClassificationRule. + */ + public Builder setErrorType(ErrorType errorType) { + this.errorType = errorType; + return this; + } + + /** + * Sets the dependency flag for ErrorClassificationRule. + */ + public Builder setDependency(boolean dependency) { + this.dependency = dependency; + return this; + } + + public ErrorClassificationRule build() { + return new ErrorClassificationRule(id, description, priority, exceptionClassRegex, + codeMethodRegex, errorType, dependency); + } + } +} diff --git a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorLogsClassifier.java b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorLogsClassifier.java index fe6b3376f917..a2233d829148 100644 --- a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorLogsClassifier.java +++ b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/ErrorLogsClassifier.java @@ -18,6 +18,8 @@ import ch.qos.logback.classic.spi.ILoggingEvent; import ch.qos.logback.classic.spi.IThrowableProxy; +import ch.qos.logback.classic.spi.StackTraceElementProxy; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Strings; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; @@ -26,34 +28,42 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; import com.google.inject.Inject; import io.cdap.cdap.api.dataset.lib.CloseableIterator; +import io.cdap.cdap.api.exception.ErrorCategory.ErrorCategoryEnum; import io.cdap.cdap.api.exception.ErrorType; import io.cdap.cdap.api.exception.FailureDetailsProvider; import io.cdap.cdap.api.exception.WrappedStageException; import io.cdap.cdap.api.metrics.MetricsCollectionService; import io.cdap.cdap.api.metrics.MetricsContext; +import io.cdap.cdap.common.conf.CConfiguration; import io.cdap.cdap.common.conf.Constants.Logging; import io.cdap.cdap.common.conf.Constants.Metrics; import io.cdap.cdap.logging.read.LogEvent; import io.cdap.cdap.proto.ErrorClassificationResponse; import io.cdap.http.HttpResponder; import io.netty.handler.codec.http.HttpResponseStatus; +import java.io.FileReader; +import java.io.Reader; +import java.lang.reflect.Type; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Classifies error logs and returns {@link ErrorClassificationResponse}. - * TODO - - * - Add rule based classification. */ public class ErrorLogsClassifier { private static final Gson GSON = new Gson(); @@ -61,6 +71,7 @@ public class ErrorLogsClassifier { "io.cdap.cdap.runtime.spi.provisioner.dataproc.DataprocRuntimeException"; private static final ImmutableList ALLOWLIST_CLASSES = ImmutableList.builder().add(DATAPROC_RUNTIME_EXCEPTION).build(); + private static final String RULES_FILE_PATH_KEY = "error.classification.rules.file.path"; private static final LoadingCache CLASS_CACHE = CacheBuilder.newBuilder() .maximumSize(5000) .build(new CacheLoader() { @@ -78,6 +89,7 @@ public Boolean load(String className) { private final Cache> responseCache; private final MetricsCollectionService metricsCollectionService; + private final List ruleList; private static boolean isFailureDetailsProviderInstance(String className) { try { @@ -91,10 +103,25 @@ private static boolean isFailureDetailsProviderInstance(String className) { * Constructor for {@link ErrorLogsClassifier}. */ @Inject - public ErrorLogsClassifier(MetricsCollectionService metricsCollectionService) { + public ErrorLogsClassifier(CConfiguration cConf, + MetricsCollectionService metricsCollectionService) { this.metricsCollectionService = metricsCollectionService; responseCache = CacheBuilder.newBuilder().maximumSize(1000) .expireAfterWrite(7, TimeUnit.DAYS).build(); + String rulesFilePath = cConf.get(RULES_FILE_PATH_KEY); + if (Strings.isNullOrEmpty(rulesFilePath)) { + LOG.info("Skipping read rules list file as it is not configured."); + ruleList = Collections.emptyList(); + return; + } + + Type listType = new TypeToken>() {}.getType(); + try (Reader reader = new FileReader(rulesFilePath)) { + ruleList = GSON.fromJson(reader, listType); + } catch (Exception e) { + throw new IllegalArgumentException("Unable to read rules file.", e); + } + Collections.sort(ruleList); } /** @@ -107,13 +134,14 @@ public ErrorLogsClassifier(MetricsCollectionService metricsCollectionService) { * @param appId The name of application. * @param runId The run id of program. */ - public void classify(CloseableIterator logIter, HttpResponder responder, - String namespace, @Nullable String program, String appId, String runId) throws Exception { + public void classify(CloseableIterator logIter, + HttpResponder responder, String namespace, @Nullable String program, String appId, + String runId) { ErrorClassificationResponseCacheKey errorClassificationResponseCacheKey = new ErrorClassificationResponseCacheKey(namespace, program, appId, runId); List responses = responseCache.getIfPresent(errorClassificationResponseCacheKey); - if (responses != null) { + if (responses != null && !responses.isEmpty()) { responder.sendJson(HttpResponseStatus.OK, GSON.toJson(responses)); return; } @@ -122,11 +150,18 @@ public void classify(CloseableIterator logIter, HttpResponder responde responseCache.put(errorClassificationResponseCacheKey, responses); } + @VisibleForTesting + public List getRuleList() { + return ruleList; + } + private List getErrorClassificationResponse(String namespace, @Nullable String program, String appId, String runId, CloseableIterator logIter, HttpResponder responder) { - Map responseMap = new HashMap<>(); - Set responseSet = new HashSet<>(); + Map responseMap = new HashMap<>(); + Set responseSet = new HashSet<>(); + ErrorClassificationResponseWrapper ruleMatchedResponse = null; + boolean isFailureDetailsProviderInstancePresent = false; while (logIter.hasNext()) { ILoggingEvent logEvent = logIter.next().getLoggingEvent(); @@ -135,14 +170,26 @@ private List getErrorClassificationResponse(String IThrowableProxy throwableProxy = logEvent.getThrowableProxy(); while (throwableProxy != null) { if (isFailureDetailsProviderInstance(throwableProxy.getClassName())) { + isFailureDetailsProviderInstancePresent = true; populateResponse(throwableProxy, mdc, stageName, responseMap, responseSet); + } else if (!isFailureDetailsProviderInstancePresent) { + String errorReason = String.format("Program run '%s:%s:%s' failed," + + "View raw logs for more details.", namespace, appId, runId); + ruleMatchedResponse = findMatchingRule(throwableProxy, ruleMatchedResponse, errorReason); } throwableProxy = throwableProxy.getCause(); } } - List responses = new ArrayList<>(responseMap.values()); + + List responses = new ArrayList<>(responseMap.values()); responses.addAll(responseSet); - responder.sendJson(HttpResponseStatus.OK, GSON.toJson(responses)); + if (responses.isEmpty() && ruleMatchedResponse != null) { + responses.add(ruleMatchedResponse); + } + List errorClassificationResponses = responses.stream() + .map(ErrorClassificationResponseWrapper::getErrorClassificationResponse) + .collect(Collectors.toList()); + responder.sendJson(HttpResponseStatus.OK, GSON.toJson(errorClassificationResponses)); // emit metric try { @@ -150,14 +197,14 @@ private List getErrorClassificationResponse(String } catch (Exception e) { LOG.error("Unable to emit metric {}.", Metrics.Program.FAILED_RUNS_CLASSIFICATION_COUNT, e); } - return responses; + return errorClassificationResponses; } private void emitMetric(String namespace, @Nullable String program, String appId, String runId, - List responses) { + List responses) { MetricsContext metricsContext = metricsCollectionService.getContext( getParentTags(namespace, program, appId, runId)); - for (ErrorClassificationResponse response : responses) { + for (ErrorClassificationResponseWrapper response : responses) { MetricsContext context = metricsContext.childContext(getChildTags(response)); context.gauge(Metrics.Program.FAILED_RUNS_CLASSIFICATION_COUNT, 1); } @@ -175,16 +222,10 @@ private Map getParentTags(String namespace, return tags.build(); } - private Map getChildTags(ErrorClassificationResponse response) { - String stageName = response.getStageName(); - String errorCategory = response.getErrorCategory(); - if (!Strings.isNullOrEmpty(stageName) && !Strings.isNullOrEmpty(errorCategory) - && errorCategory.endsWith("-" + stageName)) { - // remove stageName from errorCategory to reduce metric cardinality. - errorCategory = errorCategory.substring(0, errorCategory.length() - stageName.length() - 1); - } + private Map getChildTags(ErrorClassificationResponseWrapper responseWrapper) { + ErrorClassificationResponse response = responseWrapper.getErrorClassificationResponse(); ImmutableMap.Builder tags = ImmutableMap.builder(); - tags.put(Metrics.Tag.ERROR_CATEGORY, errorCategory); + tags.put(Metrics.Tag.ERROR_CATEGORY, responseWrapper.getParentErrorCategory()); tags.put(Metrics.Tag.ERROR_TYPE, response.getErrorType()); tags.put(Metrics.Tag.DEPENDENCY, response.getDependency()); if (!Strings.isNullOrEmpty(response.getErrorCodeType())) { @@ -196,10 +237,9 @@ private Map getChildTags(ErrorClassificationResponse response) { return tags.build(); } - private void populateResponse(IThrowableProxy throwableProxy, - Map mdc, String stageName, - Map responseMap, - Set responseSet) { + private void populateResponse(IThrowableProxy throwableProxy, Map mdc, + String stageName, Map responseMap, + Set responseSet) { boolean stageNotPresent = Strings.isNullOrEmpty(stageName); if (stageNotPresent) { responseSet.add(getClassificationResponse(stageName, mdc, throwableProxy, @@ -219,21 +259,62 @@ private void populateResponse(IThrowableProxy throwableProxy, } } - private ErrorClassificationResponse getClassificationResponse(String stageName, + @Nullable + private ErrorClassificationResponseWrapper findMatchingRule(IThrowableProxy throwableProxy, + ErrorClassificationResponseWrapper ruleMatchedResponsePrev, String errorReason) { + ErrorClassificationResponseWrapper ruleMatchedResponse = null; + for (ErrorClassificationRule rule : getRuleList()) { + if (classNameMatched(rule.getExceptionClassRegex(), throwableProxy.getClassName()) + && codeMethodMatched(rule.getCodeMethodRegex(), + throwableProxy.getStackTraceElementProxyArray())) { + String errorMessage = throwableProxy.getMessage(); + ruleMatchedResponse = new ErrorClassificationResponseWrapper( + new ErrorClassificationResponse.Builder() + .setErrorCategory(ErrorCategoryEnum.OTHERS.name()) + .setErrorType(rule.getErrorType().name()) + .setErrorReason(errorReason) + .setErrorMessage(errorMessage) + .setDependency(String.valueOf(rule.isDependency())).build(), + throwableProxy.getClassName(), ErrorCategoryEnum.OTHERS.name(), rule.getPriority(), + rule.getId()); + break; + } + } + if (ruleMatchedResponse == null) { + return ruleMatchedResponsePrev; + } else if (ruleMatchedResponsePrev == null) { + return ruleMatchedResponse; + } else { + return ruleMatchedResponse.getRulePriority() < ruleMatchedResponsePrev.getRulePriority() + ? ruleMatchedResponse : ruleMatchedResponsePrev; + } + } + + private boolean codeMethodMatched(Pattern codeMethodRegex, + StackTraceElementProxy[] stackTraceElementProxies) { + return Arrays.stream(stackTraceElementProxies).map(StackTraceElementProxy::getStackTraceElement) + .anyMatch(ste -> codeMethodRegex.matcher(ste.toString()).find()); + } + + private boolean classNameMatched(Pattern classNameRegex, String exceptionClassName) { + return classNameRegex.matcher(exceptionClassName).find(); + } + + private ErrorClassificationResponseWrapper getClassificationResponse(String stageName, Map mdc, IThrowableProxy throwableProxy, String errorCategory) { - return new ErrorClassificationResponse.Builder() - .setStageName(stageName) - .setErrorCategory(errorCategory) - .setErrorReason(mdc.get(Logging.TAG_ERROR_REASON)) - .setErrorMessage(throwableProxy.getMessage()) - .setErrorType(mdc.get(Logging.TAG_ERROR_TYPE) == null ? ErrorType.UNKNOWN.name() : - mdc.get(Logging.TAG_ERROR_TYPE)) - .setDependency(mdc.get(Logging.TAG_DEPENDENCY) == null ? String.valueOf(false) : - mdc.get(Logging.TAG_DEPENDENCY)) - .setErrorCodeType(mdc.get(Logging.TAG_ERROR_CODE_TYPE)) - .setErrorCode(mdc.get(Logging.TAG_ERROR_CODE)) - .setSupportedDocumentationUrl(mdc.get(Logging.TAG_SUPPORTED_DOC_URL)) - .setThrowableClassName(throwableProxy.getClassName()) - .build(); + return new ErrorClassificationResponseWrapper( + new ErrorClassificationResponse.Builder() + .setStageName(stageName) + .setErrorCategory(errorCategory) + .setErrorReason(mdc.get(Logging.TAG_ERROR_REASON)) + .setErrorMessage(throwableProxy.getMessage()) + .setErrorType(mdc.get(Logging.TAG_ERROR_TYPE) == null ? ErrorType.UNKNOWN.name() + : mdc.get(Logging.TAG_ERROR_TYPE)) + .setDependency(mdc.get(Logging.TAG_DEPENDENCY) == null ? String.valueOf(false) + : mdc.get(Logging.TAG_DEPENDENCY)) + .setErrorCodeType(mdc.get(Logging.TAG_ERROR_CODE_TYPE)) + .setErrorCode(mdc.get(Logging.TAG_ERROR_CODE)) + .setSupportedDocumentationUrl(mdc.get(Logging.TAG_SUPPORTED_DOC_URL)).build(), + mdc.get(Logging.TAG_PARENT_ERROR_CATEGORY), throwableProxy.getClassName(), null, null); } } diff --git a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/appender/LogAppender.java b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/appender/LogAppender.java index 719145acf8c5..9de4d95645b6 100644 --- a/cdap-watchdog/src/main/java/io/cdap/cdap/logging/appender/LogAppender.java +++ b/cdap-watchdog/src/main/java/io/cdap/cdap/logging/appender/LogAppender.java @@ -26,7 +26,6 @@ import com.google.common.cache.CacheBuilder; import io.cdap.cdap.api.exception.ErrorCodeType; import io.cdap.cdap.api.exception.FailureDetailsProvider; -import io.cdap.cdap.api.exception.ProgramFailureException; import io.cdap.cdap.common.conf.Constants; import io.cdap.cdap.common.logging.LoggingContext; import io.cdap.cdap.common.logging.LoggingContextAccessor; @@ -138,6 +137,8 @@ private void addErrorClassificationTags(ILoggingEvent event, Map } modifiableMdc.put(Constants.Logging.TAG_ERROR_CATEGORY, provider.getErrorCategory().getErrorCategory()); + modifiableMdc.put(Constants.Logging.TAG_PARENT_ERROR_CATEGORY, + provider.getErrorCategory().getParentCategory().name()); if (provider.getErrorReason() != null) { modifiableMdc.put(Constants.Logging.TAG_ERROR_REASON, provider.getErrorReason()); } diff --git a/cdap-watchdog/src/test/java/io/cdap/cdap/logging/ErrorLogsClassifierTest.java b/cdap-watchdog/src/test/java/io/cdap/cdap/logging/ErrorLogsClassifierTest.java index 7c31164b9994..627290e2f8e5 100644 --- a/cdap-watchdog/src/test/java/io/cdap/cdap/logging/ErrorLogsClassifierTest.java +++ b/cdap-watchdog/src/test/java/io/cdap/cdap/logging/ErrorLogsClassifierTest.java @@ -18,21 +18,25 @@ import ch.qos.logback.classic.spi.ILoggingEvent; import ch.qos.logback.classic.spi.IThrowableProxy; +import ch.qos.logback.classic.spi.StackTraceElementProxy; import com.google.common.collect.Iterators; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; import io.cdap.cdap.api.dataset.lib.CloseableIterator; +import io.cdap.cdap.api.exception.ErrorType; import io.cdap.cdap.api.exception.ProgramFailureException; import io.cdap.cdap.api.exception.WrappedStageException; import io.cdap.cdap.api.metrics.MetricValue; import io.cdap.cdap.api.metrics.MetricValues; import io.cdap.cdap.api.metrics.MetricsCollectionService; +import io.cdap.cdap.common.conf.CConfiguration; import io.cdap.cdap.common.conf.Constants; import io.cdap.cdap.common.conf.Constants.Metrics; import io.cdap.cdap.logging.read.LogEvent; import io.cdap.cdap.logging.read.LogOffset; import io.cdap.cdap.metrics.collect.AggregatedMetricsCollectionService; import io.cdap.cdap.proto.ErrorClassificationResponse; +import java.io.IOException; import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Collection; @@ -51,39 +55,26 @@ public class ErrorLogsClassifierTest { private static final Gson GSON = new Gson(); private final MockResponder responder = new MockResponder(); + private static final Type LIST_TYPE = + new TypeToken>() {}.getType(); @Test - public void testClassifyLogs() throws Exception { + public void testClassifyLogsWithFailureDetailsProvider() { LogEvent logEvent1 = new LogEvent(getEvent1(), LogOffset.LATEST_OFFSET); LogEvent logEvent2 = new LogEvent(getEvent2(), LogOffset.LATEST_OFFSET); List events = new ArrayList<>(); events.add(logEvent2); events.add(logEvent1); - Iterator iterator = events.iterator(); - CloseableIterator closeableIterator = new CloseableIterator() { - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public LogEvent next() { - return iterator.next(); - } - - @Override - public void close() { - // no-op - } - }; + CloseableIterator closeableIterator = getCloseableIterator(events.iterator()); List metricValuesList = new ArrayList<>(); - MetricsCollectionService mockMetricsCollectionService = getMockCollectionService(metricValuesList); + MetricsCollectionService mockMetricsCollectionService = + getMockCollectionService(metricValuesList); mockMetricsCollectionService.startAndWait(); - ErrorLogsClassifier classifier = new ErrorLogsClassifier(mockMetricsCollectionService); + CConfiguration cConf = Mockito.mock(CConfiguration.class); + ErrorLogsClassifier classifier = new ErrorLogsClassifier(cConf, mockMetricsCollectionService); classifier.classify(closeableIterator, responder, "namespace", "program", "app", "run"); - Type listType = new TypeToken>() {}.getType(); List responses = - GSON.fromJson(responder.getResponseContentAsString(), listType); + GSON.fromJson(responder.getResponseContentAsString(), LIST_TYPE); mockMetricsCollectionService.stopAndWait(); Assert.assertEquals(1, responses.size()); Assert.assertEquals("stageName", responses.get(0).getStageName()); @@ -101,9 +92,62 @@ public void close() { Metrics.Program.FAILED_RUNS_CLASSIFICATION_COUNT)); } + @Test + public void testClassifyLogsWithRuleBasedClassification() { + List metricValuesList = new ArrayList<>(); + MetricsCollectionService mockMetricsCollectionService = + getMockCollectionService(metricValuesList); + mockMetricsCollectionService.startAndWait(); + CConfiguration cConf = Mockito.mock(CConfiguration.class); + ErrorLogsClassifier classifier = new ErrorLogsClassifier(cConf, mockMetricsCollectionService); + LogEvent logEvent3 = new LogEvent(getEvent3(IllegalArgumentException.class.getName()), + LogOffset.LATEST_OFFSET); + LogEvent logEvent4 = new LogEvent(getEvent3(IOException.class.getName()), + LogOffset.LATEST_OFFSET); + List events = new ArrayList<>(); + events.add(logEvent3); + events.add(logEvent4); + ErrorLogsClassifier spy = Mockito.spy(classifier); + Mockito.when(spy.getRuleList()).thenReturn(getRulesList()); + Mockito.doCallRealMethod().when(spy).classify(Mockito.any(), Mockito.any(), Mockito.any(), + Mockito.any(), Mockito.any(), Mockito.any()); + mockMetricsCollectionService.startAndWait(); + CloseableIterator closeableIterator = getCloseableIterator(events.iterator()); + spy.classify(closeableIterator, responder, "namespace", "program", "app", "run2"); + mockMetricsCollectionService.stopAndWait(); + List responses = + GSON.fromJson(responder.getResponseContentAsString(), LIST_TYPE); + Assert.assertEquals(1, responses.size()); + Assert.assertEquals("OTHERS", responses.get(0).getErrorCategory()); + Assert.assertEquals("USER", responses.get(0).getErrorType()); + Assert.assertSame(1, metricValuesList.size()); + Assert.assertTrue(containsMetric(metricValuesList.get(0), + Metrics.Program.FAILED_RUNS_CLASSIFICATION_COUNT)); + } + + private CloseableIterator getCloseableIterator(Iterator iterator) { + return new CloseableIterator() { + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public LogEvent next() { + return iterator.next(); + } + + @Override + public void close() { + // no-op + } + }; + } + private ILoggingEvent getEvent1() { Map map = new HashMap<>(); map.put(Constants.Logging.TAG_FAILED_STAGE, "stageName"); + map.put(Constants.Logging.TAG_PARENT_ERROR_CATEGORY, "errorCategory"); map.put(Constants.Logging.TAG_ERROR_CATEGORY, "errorCategory"); map.put(Constants.Logging.TAG_ERROR_REASON, "errorReason"); map.put(Constants.Logging.TAG_ERROR_TYPE, "errorType"); @@ -119,6 +163,7 @@ private ILoggingEvent getEvent1() { private ILoggingEvent getEvent2() { Map map = new HashMap<>(); map.put(Constants.Logging.TAG_FAILED_STAGE, "stageName"); + map.put(Constants.Logging.TAG_PARENT_ERROR_CATEGORY, "errorCategory"); map.put(Constants.Logging.TAG_ERROR_CATEGORY, "errorCategory"); map.put(Constants.Logging.TAG_ERROR_REASON, "errorReason"); map.put(Constants.Logging.TAG_ERROR_TYPE, "errorType"); @@ -135,6 +180,45 @@ private ILoggingEvent getEvent2() { return event; } + private ILoggingEvent getEvent3(String exceptionClassName) { + Map map = new HashMap<>(); + IThrowableProxy throwableProxy = Mockito.mock(IThrowableProxy.class); + Mockito.when(throwableProxy.getMessage()).thenReturn("some error occurred"); + Mockito.when(throwableProxy.getClassName()) + .thenReturn(exceptionClassName); + StackTraceElementProxy steProxy = new StackTraceElementProxy( + new StackTraceElement("anyClass", "anyMethod", null, 2)); + StackTraceElementProxy [] stackTraceElementProxies = {steProxy}; + Mockito.when(throwableProxy.getStackTraceElementProxyArray()) + .thenReturn(stackTraceElementProxies); + ILoggingEvent event = Mockito.mock(ILoggingEvent.class); + Mockito.when(event.getThrowableProxy()).thenReturn(throwableProxy); + Mockito.when(event.getMDCPropertyMap()).thenReturn(map); + return event; + } + + private List getRulesList() { + List rules = new ArrayList<>(); + ErrorClassificationRule rule = + new ErrorClassificationRule.Builder() + .setId("001_illegal_argument_exception") + .setExceptionClassName(IllegalArgumentException.class.getName()) + .setErrorType(ErrorType.USER) + .setDependency(false) + .setPriority(100) + .build(); + rules.add(rule); + rule = new ErrorClassificationRule.Builder() + .setId("002_ioe_exception") + .setExceptionClassName(IOException.class.getName()) + .setErrorType(ErrorType.UNKNOWN) + .setDependency(false) + .setPriority(200) + .build(); + rules.add(rule); + return rules; + } + private MetricsCollectionService getMockCollectionService(Collection collection) { return new AggregatedMetricsCollectionService(1000L) { @Override