Skip to content

Commit

Permalink
feat(components/memory): use common db + dmesg poller for events, mov…
Browse files Browse the repository at this point in the history
…e out of "dmesg" component

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Jan 25, 2025
1 parent d24c32c commit 94487d8
Show file tree
Hide file tree
Showing 10 changed files with 348 additions and 206 deletions.
3 changes: 1 addition & 2 deletions components/dmesg/filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ import (
)

func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error) {
defaultFilters := DefaultDmesgFiltersForMemory()
defaultFilters = append(defaultFilters, DefaultDmesgFiltersForCPU()...)
defaultFilters := DefaultDmesgFiltersForCPU()
defaultFilters = append(defaultFilters, DefaultDmesgFiltersForFileDescriptor()...)

nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx)
Expand Down
70 changes: 0 additions & 70 deletions components/dmesg/filters_memory.go

This file was deleted.

110 changes: 37 additions & 73 deletions components/memory/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,42 +5,62 @@ import (
"context"
"database/sql"
"fmt"
"strconv"
"time"

"github.com/leptonai/gpud/components"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/dmesg"
events_db "github.com/leptonai/gpud/components/db"
memory_id "github.com/leptonai/gpud/components/memory/id"
"github.com/leptonai/gpud/components/memory/metrics"
"github.com/leptonai/gpud/components/query"
query_log "github.com/leptonai/gpud/components/query/log"
"github.com/leptonai/gpud/log"

"github.com/prometheus/client_golang/prometheus"
)

// New builds the memory component from cfg.
// NOTE: this span is a rendered diff hunk, so the pre-commit and post-commit
// lines appear together: both signatures and both "return &component{" literals
// are shown. The later signature additionally returns an error.
func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg Config) (components.Component, error) {
cfg.Query.SetDefaultsIfNotSet()
setDefaultPoller(cfg)

// cctx is passed to the poller and to newWatcher; ccancel is stored on the
// component and is also invoked on the error paths below.
cctx, ccancel := context.WithCancel(ctx)
getDefaultPoller().Start(cctx, cfg.Query, memory_id.Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: getDefaultPoller(),
// Events store dedicated to this component (table name derived from memory_id.Name).
// The 3*24*time.Hour argument is presumably a retention window — TODO confirm against events_db.NewStore.
eventsStore, err := events_db.NewStore(
cfg.Query.State.DBRW,
cfg.Query.State.DBRO,
events_db.CreateDefaultTableName(memory_id.Name),
3*24*time.Hour,
)
if err != nil {
ccancel()
return nil, err
}

w, err := newWatcher(cctx, eventsStore)
if err != nil {
// NOTE(review): eventsStore is not closed on this path, although Close()
// calls c.eventsStore.Close() — confirm whether it should be closed here too.
ccancel()
return nil, err
}

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: getDefaultPoller(),
cfg: cfg,
watcher: w,
eventsStore: eventsStore,
}, nil
}

var _ components.Component = (*component)(nil)

// component is the memory component; it is asserted above to implement
// components.Component.
// NOTE: this span is a rendered diff hunk, so the pre-commit field list and the
// post-commit field list (which adds cfg, watcher and eventsStore) appear together.
type component struct {
rootCtx context.Context
cancel context.CancelFunc
poller query.Poller
gatherer prometheus.Gatherer
rootCtx context.Context
cancel context.CancelFunc
poller query.Poller
cfg Config
watcher *watcher
eventsStore events_db.Store
gatherer prometheus.Gatherer
}

// Name returns memory_id.Name, the name of the memory component.
func (c *component) Name() string {
	return memory_id.Name
}
Expand Down Expand Up @@ -87,67 +107,8 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return output.States()
}

const (
EventKeyUnixSeconds = "unix_seconds"
EventKeyLogLine = "log_line"
)

// Events returns the memory component's events since the given time.
// NOTE: this span is a rendered diff hunk, so two bodies appear together: the
// pre-commit body, which fetched events from the "dmesg" component and kept only
// those whose matched filter lists memory_id.Name as an owner, and the
// post-commit body, which is the single final return of c.eventsStore.Get.
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
dmesgC, err := components.GetComponent(dmesg.Name)
if err != nil {
return nil, err
}

var dmesgComponent *dmesg.Component
if o, ok := dmesgC.(interface{ Unwrap() interface{} }); ok {
if unwrapped, ok := o.Unwrap().(*dmesg.Component); ok {
dmesgComponent = unwrapped
} else {
return nil, fmt.Errorf("expected *dmesg.Component, got %T", dmesgC)
}
}
// NOTE(review): if dmesgC does not implement Unwrap, dmesgComponent is still nil
// at this call — confirm whether (*dmesg.Component).Events tolerates a nil receiver.
dmesgEvents, err := dmesgComponent.Events(ctx, since)
if err != nil {
return nil, err
}

events := make([]components.Event, 0)
for _, ev := range dmesgEvents {
// Skip events that carry no matched log item.
v, ok := ev.ExtraInfo[dmesg.EventKeyDmesgMatchedLogItem]
if !ok {
continue
}
item, err := query_log.ParseItemJSON([]byte(v))
if err != nil || item.Matched == nil {
log.Logger.Errorw("failed to parse log item or none matched", "error", err)
continue
}

// Keep the event only when the matched filter is owned by the memory component.
name := ""
included := false
for _, owner := range item.Matched.OwnerReferences {
if owner != memory_id.Name {
continue
}
name = item.Matched.Name
included = true
}
if !included {
continue
}

events = append(events, components.Event{
Time: ev.Time,
Name: name,
Type: common.EventTypeWarning,
ExtraInfo: map[string]string{
EventKeyUnixSeconds: strconv.FormatInt(ev.Time.Unix(), 10),
EventKeyLogLine: item.Line,
},
})
}

return events, nil
return c.eventsStore.Get(ctx, since)
}

func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
Expand Down Expand Up @@ -186,6 +147,9 @@ func (c *component) Close() error {
// safe to call stop multiple times
c.poller.Stop(memory_id.Name)

c.watcher.close()
c.eventsStore.Close()

return nil
}

Expand Down
21 changes: 21 additions & 0 deletions components/memory/dmesg/dmesg.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,25 @@ const (
// NOTE: this is often followed by a line like:
// [Sun Dec 8 09:23:39 2024] oom_reaper: reaped process 345646 (vector), now anon-rss:0kB, file-rss:0kB, shmem-rss:0
// (to reap the memory used by the OOM victim)
EventOOM = "memory_oom"
RegexOOM = `Out of memory:`

// e.g.,
// oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
// [...] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
EventOOMKillConstraint = "memory_oom_kill_constraint"
RegexOOMKillConstraint = `oom-kill:constraint=`

// e.g.,
// postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
// [...] postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
EventOOMKiller = "memory_oom_killer"
RegexOOMKiller = `(?i)\b(invoked|triggered) oom-killer\b`

// e.g.,
// Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
// [...] Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
EventOOMCgroup = "memory_oom_cgroup"
RegexOOMCgroup = `Memory cgroup out of memory`

// May indicate that Dual Inline Memory Module (DIMM) is beginning to fail.
Expand All @@ -38,6 +42,7 @@ const (
// ref.
// https://serverfault.com/questions/682909/how-to-find-faulty-memory-module-from-mce-message
// https://github.com/Azure/azurehpc/blob/2d57191cb35ed638525ba9424cc2aa1b5abe1c05/experimental/aks_npd_draino/npd/deployment/node-problem-detector-config.yaml#L51C20-L51C40
EventEDACCorrectableErrors = "memory_edac_correctable_errors"
RegexEDACCorrectableErrors = `.*CE memory read error.*`
)

Expand Down Expand Up @@ -85,3 +90,19 @@ func HasEDACCorrectableErrors(line string) bool {
}
return false
}

// Match pairs a line predicate with the event name and message associated
// with it (see GetMatches for the predefined set).
type Match struct {
// Check reports whether a log line matches, e.g. HasOOM.
Check func(string) bool
// Name is the event name, e.g. EventOOM.
Name string
// Message is a short human-readable description, e.g. "oom detected".
Message string
}

// GetMatches returns a freshly allocated list of the memory-related matchers,
// each pairing a Has* check with its event name and message. The order is:
// OOM, OOM kill constraint, OOM killer, OOM cgroup, EDAC correctable errors.
func GetMatches() []Match {
	matches := make([]Match, 0, 5)
	matches = append(matches,
		Match{
			Check:   HasOOM,
			Name:    EventOOM,
			Message: "oom detected",
		},
		Match{
			Check:   HasOOMKillConstraint,
			Name:    EventOOMKillConstraint,
			Message: "oom kill constraint detected",
		},
		Match{
			Check:   HasOOMKiller,
			Name:    EventOOMKiller,
			Message: "oom killer detected",
		},
		Match{
			Check:   HasOOMCgroup,
			Name:    EventOOMCgroup,
			Message: "oom cgroup detected",
		},
		Match{
			Check:   HasEDACCorrectableErrors,
			Name:    EventEDACCorrectableErrors,
			Message: "edac correctable errors detected",
		},
	)
	return matches
}
24 changes: 24 additions & 0 deletions components/memory/testdata/dmesg.decode.iso.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
kern :warn : 2025-01-21T04:41:44,283302+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Global Exception on (GPC 8, TPC 5, SM 1): Multiple Warp Errors
kern :warn : 2025-01-21T04:41:44,283390+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x546fb0=0x11f000e 0x546fb4=0x24 0x546fa8=0xf81eb60 0x546fac=0x1174
kern :warn : 2025-01-21T04:41:44,283575+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Warp Exception on (GPC 9, TPC 1, SM 1): Out Of Range Address
kern :warn : 2025-01-21T04:41:44,283671+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Global Exception on (GPC 9, TPC 1, SM 1): Multiple Warp Errors
kern :warn : 2025-01-21T04:41:44,283758+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x54cfb0=0x11f000e 0x54cfb4=0x24 0x54cfa8=0xf81eb60 0x54cfac=0x1174
kern :warn : 2025-01-21T04:41:44,283937+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Warp Exception on (GPC 9, TPC 2, SM 0): Out Of Range Address
kern :warn : 2025-01-21T04:41:44,284042+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x54d730=0x11c000e 0x54d734=0x20 0x54d728=0xf81eb60 0x54d72c=0x1174
kern :warn : 2025-01-21T04:41:44,284250+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Warp Exception on (GPC 9, TPC 5, SM 0): Out Of Range Address
kern :warn : 2025-01-21T04:41:44,284345+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Global Exception on (GPC 9, TPC 5, SM 0): Multiple Warp Errors
kern :warn : 2025-01-21T04:41:44,284433+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x54ef30=0x117000e 0x54ef34=0x24 0x54ef28=0xf81eb60 0x54ef2c=0x1174
kern :warn : 2025-01-21T04:41:44,284616+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Warp Exception on (GPC 10, TPC 1, SM 1): Out Of Range Address
kern :warn : 2025-01-21T04:41:44,284705+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Global Exception on (GPC 10, TPC 1, SM 1): Multiple Warp Errors
kern :warn : 2025-01-21T04:41:44,284792+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x554fb0=0x119000e 0x554fb4=0x24 0x554fa8=0xf81eb60 0x554fac=0x1174
kern :warn : 2025-01-21T04:41:44,284971+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Warp Exception on (GPC 10, TPC 4, SM 0): Out Of Range Address
kern :warn : 2025-01-21T04:41:44,285060+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics SM Global Exception on (GPC 10, TPC 4, SM 0): Multiple Warp Errors
kern :warn : 2025-01-21T04:41:44,285147+00:00 NVRM: Xid (PCI:0000:38:00): 13, pid='<unknown>', name=<unknown>, Graphics Exception: ESR 0x556730=0x117000e 0x556734=0x24 0x556728=0xf81eb60 0x55672c=0x1174
kern :warn : 2025-01-21T04:41:44,287197+00:00 NVRM: Xid (PCI:0000:38:00): 43, pid=2924364, name=pt_main_thread, Ch 00000008
kern :warn : 2025-01-21T08:41:30,287197+00:00 oom_reaper: reaped process 345646 (vector), now anon-rss:0kB, file-rss:0kB, shmem-rss:0
kern :warn : 2025-01-21T08:42:30,287197+00:00 oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
kern :warn : 2025-01-21T08:43:30,287197+00:00 Out of memory: Killed process 123, UID 48, (httpd)
kern :warn : 2025-01-21T08:44:30,287197+00:00 Out of memory: Kill process 456 (python) score 50 or sacrifice child
kern :warn : 2025-01-21T08:45:30,287197+00:00 Out of memory: Killed process 123, UID 48, (httpd).
kern :warn : 2025-01-21T08:46:30,287197+00:00 oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,mems_allowed=0-1,oom_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice,task_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice/cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,task=node,pid=863987,uid=0
kern :warn : 2025-01-21T08:47:30,287197+00:00 oom-kill:constraint=OTHER_CONSTRAINT
Loading

0 comments on commit 94487d8

Please sign in to comment.