Skip to content

Commit

Permalink
[Wf-Diagnostics] Introduce new invariant to identify activity and wor…
Browse files Browse the repository at this point in the history
…kflow failures (cadence-workflow#6339)

* [Wf-Diagnostics] Introduce new invariant to identify activity and workflow failures

* address comments
  • Loading branch information
sankari165 authored Oct 9, 2024
1 parent fd46d4c commit 74ec9b1
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 12 deletions.
101 changes: 101 additions & 0 deletions service/worker/diagnostics/invariant/failure/failure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// The MIT License (MIT)

// Copyright (c) 2017-2020 Uber Technologies Inc.

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package failure

import (
"context"
"strings"

"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
)

// Failure is an invariant that will be used to identify the different failures in the workflow execution history.
// It is declared with invariant.Invariant as its underlying type, so it
// exposes the same Check and RootCause methods.
type Failure invariant.Invariant

// failure is the Failure implementation returned by NewInvariant.
type failure struct {
// workflowExecutionHistory is the history whose events Check scans.
workflowExecutionHistory *types.GetWorkflowExecutionHistoryResponse
// domain is copied from Params.Domain by NewInvariant.
domain string
}

// Params carries the inputs that NewInvariant needs to build a Failure invariant.
type Params struct {
// WorkflowExecutionHistory is the workflow history to inspect for failures.
WorkflowExecutionHistory *types.GetWorkflowExecutionHistoryResponse
// Domain is stored on the invariant as-is.
Domain string
}

// NewInvariant builds a Failure invariant over the workflow execution
// history and domain supplied in p.
func NewInvariant(p Params) Failure {
	inv := new(failure)
	inv.workflowExecutionHistory = p.WorkflowExecutionHistory
	inv.domain = p.Domain
	return inv
}

// Check walks the workflow execution history and reports one result for
// every WorkflowExecutionFailed or ActivityTaskFailed event that carries a
// non-nil Reason.
//
// Each result holds the failure kind as InvariantType, the error category
// derived from the reason string as Reason, and a JSON-encoded
// failureMetadata with the relevant identity as Metadata. The returned slice
// is empty (not nil) when nothing failed; the error is always nil.
func (f *failure) Check(context.Context) ([]invariant.InvariantCheckResult, error) {
	history := f.workflowExecutionHistory.GetHistory().GetEvents()
	issues := []invariant.InvariantCheckResult{}

	for _, ev := range history {
		if wfAttr := ev.GetWorkflowExecutionFailedEventAttributes(); wfAttr != nil && wfAttr.Reason != nil {
			// The failed event itself has no identity; it is looked up on the
			// decision task completion the event refers to.
			meta := failureMetadata{Identity: fetchIdentity(wfAttr, history)}
			issues = append(issues, invariant.InvariantCheckResult{
				InvariantType: WorkflowFailed.String(),
				Reason:        errorTypeFromReason(*wfAttr.Reason).String(),
				Metadata:      invariant.MarshalData(meta),
			})
		}

		if actAttr := ev.GetActivityTaskFailedEventAttributes(); actAttr != nil && actAttr.Reason != nil {
			meta := failureMetadata{Identity: actAttr.Identity}
			issues = append(issues, invariant.InvariantCheckResult{
				InvariantType: ActivityFailed.String(),
				Reason:        errorTypeFromReason(*actAttr.Reason).String(),
				Metadata:      invariant.MarshalData(meta),
			})
		}
	}

	return issues, nil
}

// errorTypeFromReason maps a failure reason string to an ErrorType.
// A reason containing "Generic" yields GenericError, otherwise one containing
// "Panic" yields PanicError, and anything else is treated as CustomError.
func errorTypeFromReason(reason string) ErrorType {
	switch {
	case strings.Contains(reason, "Generic"):
		return GenericError
	case strings.Contains(reason, "Panic"):
		return PanicError
	default:
		return CustomError
	}
}

// fetchIdentity returns the Identity recorded on the DecisionTaskCompleted
// event that attr.DecisionTaskCompletedEventID refers to.
//
// It returns "" when attr is nil or when no DecisionTaskCompleted event with
// that ID is present in events. Events that are nil or that are not
// DecisionTaskCompleted events are skipped, so an ID that happens to match an
// unrelated event (for example an unset ID of 0) cannot cause a nil
// dereference.
func fetchIdentity(attr *types.WorkflowExecutionFailedEventAttributes, events []*types.HistoryEvent) string {
	if attr == nil {
		return ""
	}
	for _, event := range events {
		// The getter is nil-safe, so a non-nil result also proves event != nil.
		completed := event.GetDecisionTaskCompletedEventAttributes()
		if completed == nil {
			continue
		}
		if event.ID == attr.DecisionTaskCompletedEventID {
			return completed.Identity
		}
	}
	return ""
}

// RootCause currently performs no analysis for failure results: it always
// returns a nil slice and a nil error.
func (f *failure) RootCause(ctx context.Context, results []invariant.InvariantCheckResult) ([]invariant.InvariantRootCauseResult, error) {
	var rootCauses []invariant.InvariantRootCauseResult
	return rootCauses, nil
}
134 changes: 134 additions & 0 deletions service/worker/diagnostics/invariant/failure/failure_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// The MIT License (MIT)

// Copyright (c) 2017-2020 Uber Technologies Inc.

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package failure

import (
"context"
"encoding/json"
"testing"

"github.com/stretchr/testify/require"

"github.com/uber/cadence/common"
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
)

// testDomain is the domain name passed to NewInvariant by the tests in this file.
const (
testDomain = "test-domain"
)

// Test__Check verifies that Check reports every failed activity and the
// failed workflow found in the history, with the expected error category and
// identity metadata for each.
func Test__Check(t *testing.T) {
	metadata := failureMetadata{
		Identity: "localhost",
	}
	metadataInBytes, err := json.Marshal(metadata)
	require.NoError(t, err)

	testCases := []struct {
		name           string
		testData       *types.GetWorkflowExecutionHistoryResponse
		expectedResult []invariant.InvariantCheckResult
		err            error
	}{
		{
			name:     "workflow and activity failures",
			testData: failedWfHistory(),
			expectedResult: []invariant.InvariantCheckResult{
				{
					InvariantType: ActivityFailed.String(),
					Reason:        GenericError.String(),
					Metadata:      metadataInBytes,
				},
				{
					InvariantType: ActivityFailed.String(),
					Reason:        PanicError.String(),
					Metadata:      metadataInBytes,
				},
				{
					InvariantType: ActivityFailed.String(),
					Reason:        CustomError.String(),
					Metadata:      metadataInBytes,
				},
				{
					InvariantType: WorkflowFailed.String(),
					Reason:        CustomError.String(),
					Metadata:      metadataInBytes,
				},
			},
			err: nil,
		},
	}

	for _, tc := range testCases {
		// Run each case as a subtest so a failure is reported under its name.
		t.Run(tc.name, func(t *testing.T) {
			inv := NewInvariant(Params{
				WorkflowExecutionHistory: tc.testData,
				Domain:                   testDomain,
			})
			result, err := inv.Check(context.Background())
			require.Equal(t, tc.err, err)
			// ElementsMatch also fails on a length mismatch, so no separate
			// length assertion is needed.
			require.ElementsMatch(t, tc.expectedResult, result)
		})
	}
}

// failedWfHistory builds the history fixture used by Test__Check: three
// failed activities (generic, panic and custom reasons), a completed decision
// task with ID 10, and a workflow failure that points at that decision task.
func failedWfHistory() *types.GetWorkflowExecutionHistoryResponse {
	const workerIdentity = "localhost"

	// details returns a fresh byte slice per event so events share no backing array.
	details := func() []byte { return []byte("test-activity-failure") }

	// activityFailed builds an ActivityTaskFailed event with the given reason.
	activityFailed := func(reason string) *types.HistoryEvent {
		return &types.HistoryEvent{
			ActivityTaskFailedEventAttributes: &types.ActivityTaskFailedEventAttributes{
				Reason:   common.StringPtr(reason),
				Details:  details(),
				Identity: workerIdentity,
			},
		}
	}

	decisionCompleted := &types.HistoryEvent{
		ID: 10,
		DecisionTaskCompletedEventAttributes: &types.DecisionTaskCompletedEventAttributes{
			Identity: workerIdentity,
		},
	}

	workflowFailed := &types.HistoryEvent{
		WorkflowExecutionFailedEventAttributes: &types.WorkflowExecutionFailedEventAttributes{
			Reason:                       common.StringPtr("custom error"),
			Details:                      details(),
			DecisionTaskCompletedEventID: 10,
		},
	}

	events := []*types.HistoryEvent{
		activityFailed("cadenceInternal:Generic"),
		activityFailed("cadenceInternal:Panic"),
		activityFailed("custom error"),
		decisionCompleted,
		workflowFailed,
	}

	return &types.GetWorkflowExecutionHistoryResponse{
		History: &types.History{Events: events},
	}
}
50 changes: 50 additions & 0 deletions service/worker/diagnostics/invariant/failure/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// The MIT License (MIT)

// Copyright (c) 2017-2020 Uber Technologies Inc.

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package failure

// ErrorType is a human-readable description of the category a failure reason
// falls into.
type ErrorType string

// Error categories reported for failed workflows and activities.
const (
	// CustomError describes a failure caused by a custom error from service code.
	CustomError = ErrorType("The failure is caused by a specific custom error returned from the service code")
	// GenericError describes a failure caused by an error returned from service code.
	GenericError = ErrorType("The failure is because of an error returned from the service code")
	// PanicError describes a failure caused by a panic in service code.
	PanicError = ErrorType("The failure is caused by a panic in the service code")
)

// String returns the description text of the error type.
func (e ErrorType) String() string {
	return string(e)
}

// FailureType names the kind of failure a check result refers to.
type FailureType string

// Failure kinds reported as the InvariantType of a check result.
const (
	// ActivityFailed marks a result produced from a failed activity task.
	ActivityFailed = FailureType("Activity Failed")
	// WorkflowFailed marks a result produced from a failed workflow execution.
	WorkflowFailed = FailureType("Workflow Failed")
)

// String returns the failure type as a plain string.
func (f FailureType) String() string {
	return string(f)
}

// failureMetadata is the payload that is JSON-encoded into the Metadata field
// of a failure check result.
type failureMetadata struct {
// Identity is taken from the failed activity event or, for a workflow
// failure, from the DecisionTaskCompleted event the failure refers to.
Identity string
}
6 changes: 6 additions & 0 deletions service/worker/diagnostics/invariant/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ package invariant

import (
"context"
"encoding/json"
)

// InvariantCheckResult is the result from the invariant check
Expand Down Expand Up @@ -57,3 +58,8 @@ type Invariant interface {
Check(context.Context) ([]InvariantCheckResult, error)
RootCause(context.Context, []InvariantCheckResult) ([]InvariantRootCauseResult, error)
}

// MarshalData JSON-encodes rc and returns the encoded bytes.
// Encoding is best effort: if rc cannot be marshalled the error is dropped
// and nil is returned.
func MarshalData(rc any) []byte {
	encoded, err := json.Marshal(rc)
	if err != nil {
		// Deliberately swallowed: the signature has no error return, so
		// callers simply receive no metadata.
		return nil
	}
	return encoded
}
12 changes: 6 additions & 6 deletions service/worker/diagnostics/invariant/timeout/timeout.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func (t *timeout) Check(context.Context) ([]invariant.InvariantCheckResult, erro
result = append(result, invariant.InvariantCheckResult{
InvariantType: TimeoutTypeExecution.String(),
Reason: event.GetWorkflowExecutionTimedOutEventAttributes().GetTimeoutType().String(),
Metadata: marshalData(data),
Metadata: invariant.MarshalData(data),
})
}
if event.ActivityTaskTimedOutEventAttributes != nil {
Expand All @@ -81,15 +81,15 @@ func (t *timeout) Check(context.Context) ([]invariant.InvariantCheckResult, erro
result = append(result, invariant.InvariantCheckResult{
InvariantType: TimeoutTypeActivity.String(),
Reason: event.GetActivityTaskTimedOutEventAttributes().GetTimeoutType().String(),
Metadata: marshalData(metadata),
Metadata: invariant.MarshalData(metadata),
})
}
if event.DecisionTaskTimedOutEventAttributes != nil {
reason, metadata := reasonForDecisionTaskTimeouts(event, events)
result = append(result, invariant.InvariantCheckResult{
InvariantType: TimeoutTypeDecision.String(),
Reason: reason,
Metadata: marshalData(metadata),
Metadata: invariant.MarshalData(metadata),
})
}
if event.ChildWorkflowExecutionTimedOutEventAttributes != nil {
Expand All @@ -102,7 +102,7 @@ func (t *timeout) Check(context.Context) ([]invariant.InvariantCheckResult, erro
result = append(result, invariant.InvariantCheckResult{
InvariantType: TimeoutTypeChildWorkflow.String(),
Reason: event.GetChildWorkflowExecutionTimedOutEventAttributes().TimeoutType.String(),
Metadata: marshalData(data),
Metadata: invariant.MarshalData(data),
})
}
}
Expand Down Expand Up @@ -166,7 +166,7 @@ func (t *timeout) checkTasklist(ctx context.Context, issue invariant.InvariantCh
}

tasklistBacklog := resp.GetTaskListStatus().GetBacklogCountHint()
polllersMetadataInBytes := marshalData(PollersMetadata{TaskListBacklog: tasklistBacklog})
polllersMetadataInBytes := invariant.MarshalData(PollersMetadata{TaskListBacklog: tasklistBacklog})
if len(resp.GetPollers()) == 0 {
return invariant.InvariantRootCauseResult{
RootCause: invariant.RootCauseTypeMissingPollers,
Expand All @@ -187,7 +187,7 @@ func checkHeartbeatStatus(issue invariant.InvariantCheckResult) ([]invariant.Inv
return nil, err
}

heartbeatingMetadataInBytes := marshalData(HeartbeatingMetadata{TimeElapsed: metadata.TimeElapsed})
heartbeatingMetadataInBytes := invariant.MarshalData(HeartbeatingMetadata{TimeElapsed: metadata.TimeElapsed})

if metadata.HeartBeatTimeout == 0 && activityStarted(metadata) {
return []invariant.InvariantRootCauseResult{
Expand Down
6 changes: 0 additions & 6 deletions service/worker/diagnostics/invariant/timeout/timeout_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
package timeout

import (
"encoding/json"
"fmt"
"sort"
"time"
Expand Down Expand Up @@ -134,8 +133,3 @@ func getExecutionTime(startID, timeoutID int64, events []*types.HistoryEvent) ti
lastEvent := events[timeoutID-1]
return time.Unix(0, common.Int64Default(lastEvent.Timestamp)).Sub(time.Unix(0, common.Int64Default(firstEvent.Timestamp)))
}

// marshalData JSON-encodes rc and returns the encoded bytes.
// Encoding is best effort: if rc cannot be marshalled the error is dropped
// and nil is returned.
func marshalData(rc any) []byte {
	if encoded, err := json.Marshal(rc); err == nil {
		return encoded
	}
	// Deliberately swallowed: there is no error return to report it through.
	return nil
}

0 comments on commit 74ec9b1

Please sign in to comment.