From 521e9b5cd545ba0f8cf33e932bf82de0d0e6a5b9 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Mon, 6 Jan 2025 20:58:10 +0800 Subject: [PATCH] feat(dmesg): skip "detected invalid context" from peermem events as the latest driver fixes the issue ref. https://github.com/Mellanox/nv_peer_memory/issues/120 Signed-off-by: Gyuho Lee --- .../accelerator/nvidia/peermem/component.go | 53 +------------------ .../accelerator/nvidia/query/peermem/dmesg.go | 10 ++++ components/dmesg/filters_nvidia.go | 14 ----- 3 files changed, 11 insertions(+), 66 deletions(-) diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go index 4396345a..64d99517 100644 --- a/components/accelerator/nvidia/peermem/component.go +++ b/components/accelerator/nvidia/peermem/component.go @@ -5,13 +5,11 @@ package peermem import ( "context" "fmt" - "strconv" "time" "github.com/leptonai/gpud/components" nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" - "github.com/leptonai/gpud/components/dmesg" "github.com/leptonai/gpud/components/query" "github.com/leptonai/gpud/log" ) @@ -98,56 +96,7 @@ const ( ) func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) { - dmesgC, err := components.GetComponent(dmesg.Name) - if err != nil { - return nil, err - } - - var dmesgComponent *dmesg.Component - if o, ok := dmesgC.(interface{ Unwrap() interface{} }); ok { - if unwrapped, ok := o.Unwrap().(*dmesg.Component); ok { - dmesgComponent = unwrapped - } else { - return nil, fmt.Errorf("expected *dmesg.Component, got %T", dmesgC) - } - } - dmesgTailResults, err := dmesgComponent.TailScan() - if err != nil { - return nil, err - } - - // dedup by minute level - seenMinute := make(map[int64]struct{}) - events := make([]components.Event, 0) - for _, logItem := range dmesgTailResults.TailScanMatched { - if logItem.Error != nil { - continue - } - if logItem.Matched == nil { - continue - } - if logItem.Matched.Name != dmesg.EventNvidiaPeermemInvalidContext { - continue - } - - minute := logItem.Time.Unix() / 60 - if _, ok := seenMinute[minute]; ok { - continue - } - seenMinute[minute] = struct{}{} - - events = append(events, components.Event{ - Time: logItem.Time, - Name: EventNamePeermemInvalidContextFromDmesg, - Type: components.EventTypeCritical, - ExtraInfo: map[string]string{ - EventKeyPeermemInvalidContextFromDmesgUnixSeconds: strconv.FormatInt(logItem.Time.Unix(), 10), - EventKeyPeermemInvalidContextFromDmesgLogLine: logItem.Line, - }, - }) - } - - return events, nil + return nil, nil } func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) { diff --git a/components/accelerator/nvidia/query/peermem/dmesg.go b/components/accelerator/nvidia/query/peermem/dmesg.go index 3a12cef8..1c6ccea7 100644 --- a/components/accelerator/nvidia/query/peermem/dmesg.go +++ b/components/accelerator/nvidia/query/peermem/dmesg.go @@ -6,4 +6,14 @@ package peermem // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing +// +// NOTE +// skip this for now +// the latest driver https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-560-35-03/index.html#abstract fixes this issue +// "4272659 – A design defect has been identified and mitigated in the GPU kernel-mode driver, related to the GPUDirect RDMA support +// in MLNX_OFED and some Ubuntu kernels, commonly referred to as the PeerDirect technology, i.e. the one using the peer-memory kernel +// patch. In specific scenarios, for example involving the cleanup after killing of a multi-process application, this issue may lead to +// use-after-free and potentially to kernel memory corruption." +// ref. https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-129-03/index.html +// ref. https://github.com/Mellanox/nv_peer_memory/issues/120 const RegexInvalidContext = `.*ERROR detected invalid context, skipping further processing` diff --git a/components/dmesg/filters_nvidia.go b/components/dmesg/filters_nvidia.go index 3838d6fd..785eacf5 100644 --- a/components/dmesg/filters_nvidia.go +++ b/components/dmesg/filters_nvidia.go @@ -4,9 +4,7 @@ import ( nvidia_component_error_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/sxid/id" nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id" - nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" nvidia_query_nccl "github.com/leptonai/gpud/components/accelerator/nvidia/query/nccl" - nvidia_query_peermem "github.com/leptonai/gpud/components/accelerator/nvidia/query/peermem" nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid" nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" query_log_common "github.com/leptonai/gpud/components/query/log/common" @@ -33,13 +31,6 @@ const ( // https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid" - // repeated messages may indicate more persistent issue on the inter-GPU communication - // e.g., - // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing - // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing - // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing - EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context" - // repeated messages may indicate GPU communication issues, which may happen due to fabric manager issues // e.g., // [Thu Oct 10 03:06:53 2024] pt_main_thread[2536443]: segfault at 7f797fe00000 ip 00007f7c7ac69996 sp 00007f7c12fd7c30 error 4 in libnccl.so.2[7f7c7ac00000+d3d3000] @@ -58,11 +49,6 @@ func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter { Regex: ptr.To(nvidia_query_sxid.RegexNVSwitchSXidDmesg), OwnerReferences: []string{nvidia_component_error_sxid_id.Name}, }, - { - Name: EventNvidiaPeermemInvalidContext, - Regex: ptr.To(nvidia_query_peermem.RegexInvalidContext), - OwnerReferences: []string{nvidia_peermem_id.Name}, - }, { Name: EventNvidiaNCCLSegfaultInLibnccl, Regex: ptr.To(nvidia_query_nccl.RegexSegfaultInLibnccl),