Skip to content

Commit

Permalink
feat(dmesg): skip "detected invalid context" from peermem events
Browse files Browse the repository at this point in the history
as the latest driver fixes the issue

ref. Mellanox/nv_peer_memory#120

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 6, 2025
1 parent 35fe95a commit 521e9b5
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 66 deletions.
53 changes: 1 addition & 52 deletions components/accelerator/nvidia/peermem/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@ package peermem
import (
"context"
"fmt"
"strconv"
"time"

"github.com/leptonai/gpud/components"
nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/dmesg"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)
Expand Down Expand Up @@ -98,56 +96,7 @@ const (
)

func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
dmesgC, err := components.GetComponent(dmesg.Name)
if err != nil {
return nil, err
}

var dmesgComponent *dmesg.Component
if o, ok := dmesgC.(interface{ Unwrap() interface{} }); ok {
if unwrapped, ok := o.Unwrap().(*dmesg.Component); ok {
dmesgComponent = unwrapped
} else {
return nil, fmt.Errorf("expected *dmesg.Component, got %T", dmesgC)
}
}
dmesgTailResults, err := dmesgComponent.TailScan()
if err != nil {
return nil, err
}

// dedup by minute level
seenMinute := make(map[int64]struct{})
events := make([]components.Event, 0)
for _, logItem := range dmesgTailResults.TailScanMatched {
if logItem.Error != nil {
continue
}
if logItem.Matched == nil {
continue
}
if logItem.Matched.Name != dmesg.EventNvidiaPeermemInvalidContext {
continue
}

minute := logItem.Time.Unix() / 60
if _, ok := seenMinute[minute]; ok {
continue
}
seenMinute[minute] = struct{}{}

events = append(events, components.Event{
Time: logItem.Time,
Name: EventNamePeermemInvalidContextFromDmesg,
Type: components.EventTypeCritical,
ExtraInfo: map[string]string{
EventKeyPeermemInvalidContextFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10),
EventKeyPeermemInvalidContextFromDmesgLogLine: logItem.Line,
},
})
}

return events, nil
return nil, nil
}

func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
Expand Down
10 changes: 10 additions & 0 deletions components/accelerator/nvidia/query/peermem/dmesg.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ package peermem
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
//
// NOTE:
// Skip this for now, since the latest driver fixes this issue
// (see https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-560-35-03/index.html#abstract):
// "4272659 – A design defect has been identified and mitigated in the GPU kernel-mode driver, related to the GPUDirect RDMA support
// in MLNX_OFED and some Ubuntu kernels, commonly referred to as the PeerDirect technology, i.e. the one using the peer-memory kernel
// patch. In specific scenarios, for example involving the cleanup after killing of a multi-process application, this issue may lead to
// use-after-free and potentially to kernel memory corruption."
// ref. https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-129-03/index.html
// ref. https://github.com/Mellanox/nv_peer_memory/issues/120
const RegexInvalidContext = `.*ERROR detected invalid context, skipping further processing`
14 changes: 0 additions & 14 deletions components/dmesg/filters_nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ import (
nvidia_component_error_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid/id"
nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id"
nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id"
nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id"
nvidia_query_nccl "github.com/leptonai/gpud/components/accelerator/nvidia/query/nccl"
nvidia_query_peermem "github.com/leptonai/gpud/components/accelerator/nvidia/query/peermem"
nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid"
nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
query_log_common "github.com/leptonai/gpud/components/query/log/common"
Expand All @@ -33,13 +31,6 @@ const (
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid"

// repeated messages may indicate a more persistent issue with inter-GPU communication
// e.g.,
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context"

// repeated messages may indicate GPU communication issues, which may happen due to fabric manager issues
// e.g.,
// [Thu Oct 10 03:06:53 2024] pt_main_thread[2536443]: segfault at 7f797fe00000 ip 00007f7c7ac69996 sp 00007f7c12fd7c30 error 4 in libnccl.so.2[7f7c7ac00000+d3d3000]
Expand All @@ -58,11 +49,6 @@ func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter {
Regex: ptr.To(nvidia_query_sxid.RegexNVSwitchSXidDmesg),
OwnerReferences: []string{nvidia_component_error_sxid_id.Name},
},
{
Name: EventNvidiaPeermemInvalidContext,
Regex: ptr.To(nvidia_query_peermem.RegexInvalidContext),
OwnerReferences: []string{nvidia_peermem_id.Name},
},
{
Name: EventNvidiaNCCLSegfaultInLibnccl,
Regex: ptr.To(nvidia_query_nccl.RegexSegfaultInLibnccl),
Expand Down

0 comments on commit 521e9b5

Please sign in to comment.