Skip to content

Commit

Permalink
feat(components/memory): use common db + dmesg poller for events, mov…
Browse files Browse the repository at this point in the history
…e out of "dmesg" component (#324)

cf. #322

Tested:

<img width="1538" alt="Screenshot 2025-01-25 at 10 38 26 AM"
src="https://github.com/user-attachments/assets/89cbaebe-9e24-4343-a1a2-3883e592ca21"
/>

```json
    "component": "memory",
    "startTime": "2024-11-15T04:48:56Z",
    "endTime": "2025-01-25T02:39:01.610443725Z",
    "events": [
      {
        "time": "2025-01-25T02:38:04Z",
        "name": "memory_oom_kill_constraint",
        "type": "Warning",
        "message": "oom kill constraint detected",
        "extra_info": {
          "log_line": "oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,mems_allowed=0-1,oom_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice,task_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice/cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,task=node,pid=863987,uid=0"
        },
        "suggested_actions": {
          "descriptions": null,
          "repair_actions": null
        }
      },
      {
        "time": "2025-01-25T02:36:53Z",
        "name": "memory_oom_kill_constraint",
        "type": "Warning",
        "message": "oom kill constraint detected",
        "extra_info": {
          "log_line": "oom-kill:constraint=OTHER_CONSTRAINT"
        },
        "suggested_actions": {
          "descriptions": null,
          "repair_actions": null
        }
      },
      {
        "time": "2025-01-25T02:36:50Z",
        "name": "memory_oom_kill_constraint",
        "type": "Warning",
        "message": "oom kill constraint detected",
        "extra_info": {
          "log_line": "oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,mems_allowed=0-1,oom_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice,task_memcg=/lxc.payload.ny2g2r5hh3-lxc/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-poda8697f49_441d_4d4f_90d2_6d8e1fa3bbe7.slice/cri-containerd-3fc28fa1c647ceede9f6b340d0b16c9f1f663698972d22a52e296f291638e014.scope,task=node,pid=863987,uid=0"
        },
        "suggested_actions": {
          "descriptions": null,
          "repair_actions": null
        }
      },
      {
        "time": "2025-01-25T02:36:46Z",
        "name": "memory_oom",
        "type": "Warning",
        "message": "oom detected",
        "extra_info": {
          "log_line": "Out of memory: Kill process 456 (python) score 50 or sacrifice child"
        },
        "suggested_actions": {
          "descriptions": null,
          "repair_actions": null
        }
      },
      {
        "time": "2025-01-25T02:36:42Z",
        "name": "memory_oom",
        "type": "Warning",
        "message": "oom detected",
        "extra_info": {
          "log_line": "Out of memory: Killed process 123, UID 48, (httpd)"
        },
        "suggested_actions": {
          "descriptions": null,
          "repair_actions": null
        }
      }
    ]
```

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Jan 26, 2025
1 parent 02d1814 commit be52aef
Show file tree
Hide file tree
Showing 11 changed files with 431 additions and 206 deletions.
3 changes: 1 addition & 2 deletions components/dmesg/filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ import (
)

func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error) {
defaultFilters := DefaultDmesgFiltersForMemory()
defaultFilters = append(defaultFilters, DefaultDmesgFiltersForCPU()...)
defaultFilters := DefaultDmesgFiltersForCPU()
defaultFilters = append(defaultFilters, DefaultDmesgFiltersForFileDescriptor()...)

nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx)
Expand Down
70 changes: 0 additions & 70 deletions components/dmesg/filters_memory.go

This file was deleted.

110 changes: 37 additions & 73 deletions components/memory/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,42 +5,61 @@ import (
"context"
"database/sql"
"fmt"
"strconv"
"time"

"github.com/leptonai/gpud/components"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/dmesg"
events_db "github.com/leptonai/gpud/components/db"
memory_id "github.com/leptonai/gpud/components/memory/id"
"github.com/leptonai/gpud/components/memory/metrics"
"github.com/leptonai/gpud/components/query"
query_log "github.com/leptonai/gpud/components/query/log"
"github.com/leptonai/gpud/log"

"github.com/prometheus/client_golang/prometheus"
)

func New(ctx context.Context, cfg Config) components.Component {
func New(ctx context.Context, cfg Config) (components.Component, error) {
eventsStore, err := events_db.NewStore(
cfg.Query.State.DBRW,
cfg.Query.State.DBRO,
events_db.CreateDefaultTableName(memory_id.Name),
3*24*time.Hour,
)
if err != nil {
return nil, err
}

cfg.Query.SetDefaultsIfNotSet()
setDefaultPoller(cfg)

cctx, ccancel := context.WithCancel(ctx)
getDefaultPoller().Start(cctx, cfg.Query, memory_id.Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: getDefaultPoller(),
w, err := newWatcher(cctx, eventsStore)
if err != nil {
ccancel()
return nil, err
}

return &component{
ctx: cctx,
cancel: ccancel,
poller: getDefaultPoller(),
cfg: cfg,
watcher: w,
eventsStore: eventsStore,
}, nil
}

var _ components.Component = (*component)(nil)

type component struct {
rootCtx context.Context
cancel context.CancelFunc
poller query.Poller
gatherer prometheus.Gatherer
ctx context.Context
cancel context.CancelFunc
poller query.Poller
cfg Config
watcher *watcher
eventsStore events_db.Store
gatherer prometheus.Gatherer
}

func (c *component) Name() string { return memory_id.Name }
Expand Down Expand Up @@ -87,67 +106,8 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return output.States()
}

const (
EventKeyUnixSeconds = "unix_seconds"
EventKeyLogLine = "log_line"
)

func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
dmesgC, err := components.GetComponent(dmesg.Name)
if err != nil {
return nil, err
}

var dmesgComponent *dmesg.Component
if o, ok := dmesgC.(interface{ Unwrap() interface{} }); ok {
if unwrapped, ok := o.Unwrap().(*dmesg.Component); ok {
dmesgComponent = unwrapped
} else {
return nil, fmt.Errorf("expected *dmesg.Component, got %T", dmesgC)
}
}
dmesgEvents, err := dmesgComponent.Events(ctx, since)
if err != nil {
return nil, err
}

events := make([]components.Event, 0)
for _, ev := range dmesgEvents {
v, ok := ev.ExtraInfo[dmesg.EventKeyDmesgMatchedLogItem]
if !ok {
continue
}
item, err := query_log.ParseItemJSON([]byte(v))
if err != nil || item.Matched == nil {
log.Logger.Errorw("failed to parse log item or none matched", "error", err)
continue
}

name := ""
included := false
for _, owner := range item.Matched.OwnerReferences {
if owner != memory_id.Name {
continue
}
name = item.Matched.Name
included = true
}
if !included {
continue
}

events = append(events, components.Event{
Time: ev.Time,
Name: name,
Type: common.EventTypeWarning,
ExtraInfo: map[string]string{
EventKeyUnixSeconds: strconv.FormatInt(ev.Time.Unix(), 10),
EventKeyLogLine: item.Line,
},
})
}

return events, nil
return c.eventsStore.Get(ctx, since)
}

func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
Expand Down Expand Up @@ -182,10 +142,14 @@ func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.

// Close shuts the component down: it cancels the component's context, calls
// Stop on the poller under this component's name, and closes the watcher and
// the events store. It always returns nil.
func (c *component) Close() error {
log.Logger.Debugw("closing component")
// cancels the context that New derived for this component (the same context
// that was handed to the poller and the watcher)
c.cancel()

// safe to call stop multiple times
c.poller.Stop(memory_id.Name)

// NOTE(review): whatever these two calls return (if anything) is not
// inspected here — confirm whether a close failure should be surfaced.
c.watcher.close()
c.eventsStore.Close()

return nil
}

Expand Down
30 changes: 30 additions & 0 deletions components/memory/dmesg/dmesg.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,25 @@ const (
// NOTE: this is often followed by a line like:
// [Sun Dec 8 09:23:39 2024] oom_reaper: reaped process 345646 (vector), now anon-rss:0kB, file-rss:0kB, shmem-rss:0
// (to reap the memory used by the OOM victim)
EventOOM = "memory_oom"
RegexOOM = `Out of memory:`

// e.g.,
// oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
// [...] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
EventOOMKillConstraint = "memory_oom_kill_constraint"
RegexOOMKillConstraint = `oom-kill:constraint=`

// e.g.,
// postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
// [...] postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
EventOOMKiller = "memory_oom_killer"
RegexOOMKiller = `(?i)\b(invoked|triggered) oom-killer\b`

// e.g.,
// Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
// [...] Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
EventOOMCgroup = "memory_oom_cgroup"
RegexOOMCgroup = `Memory cgroup out of memory`

// May indicate that Dual Inline Memory Module (DIMM) is beginning to fail.
Expand All @@ -38,6 +42,7 @@ const (
// ref.
// https://serverfault.com/questions/682909/how-to-find-faulty-memory-module-from-mce-message
// https://github.com/Azure/azurehpc/blob/2d57191cb35ed638525ba9424cc2aa1b5abe1c05/experimental/aks_npd_draino/npd/deployment/node-problem-detector-config.yaml#L51C20-L51C40
EventEDACCorrectableErrors = "memory_edac_correctable_errors"
RegexEDACCorrectableErrors = `.*CE memory read error.*`
)

Expand Down Expand Up @@ -85,3 +90,28 @@ func HasEDACCorrectableErrors(line string) bool {
}
return false
}

// Match checks line against every matcher returned by getMatches, in order,
// and returns the event name and message of the first one whose check
// accepts the line. Both results are empty strings when no matcher accepts it.
func Match(line string) (name string, message string) {
	candidates := getMatches()
	for i := range candidates {
		if !candidates[i].check(line) {
			continue
		}
		return candidates[i].name, candidates[i].message
	}
	return "", ""
}

// match pairs a line predicate with the event name and message that are
// reported when the predicate accepts a line.
type match struct {
	// check reports whether a log line belongs to this matcher.
	check func(string) bool
	// name is the event name and message is the accompanying text returned
	// for a line accepted by check.
	name, message string
}

// getMatches returns a freshly built list of the memory-related matchers.
// The order is significant to callers that stop at the first accepting
// matcher: OOM, OOM kill constraint, OOM killer, OOM cgroup, then EDAC
// correctable errors.
func getMatches() []match {
	matchers := make([]match, 0, 5)
	matchers = append(matchers,
		match{check: HasOOM, name: EventOOM, message: "oom detected"},
		match{check: HasOOMKillConstraint, name: EventOOMKillConstraint, message: "oom kill constraint detected"},
		match{check: HasOOMKiller, name: EventOOMKiller, message: "oom killer detected"},
		match{check: HasOOMCgroup, name: EventOOMCgroup, message: "oom cgroup detected"},
		match{check: HasEDACCorrectableErrors, name: EventEDACCorrectableErrors, message: "edac correctable errors detected"},
	)
	return matchers
}
80 changes: 80 additions & 0 deletions components/memory/dmesg/dmesg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package dmesg

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestHasOOM(t *testing.T) {
Expand Down Expand Up @@ -223,3 +225,81 @@ func TestHasEDACCorrectableErrors(t *testing.T) {
})
}
}

// TestMatch runs Match over representative log lines, one subtest per case,
// and asserts the returned event name and message. Lines that fit no known
// pattern (including the empty line) are expected to yield empty strings.
func TestMatch(t *testing.T) {
	type testCase struct {
		desc     string // subtest name
		line     string // log line handed to Match
		wantName string // expected event name
		wantMsg  string // expected event message
	}

	cases := []testCase{
		{
			desc:     "OOM basic case",
			line:     "Out of memory: Killed process 123, UID 48, (httpd).",
			wantName: EventOOM,
			wantMsg:  "oom detected",
		},
		{
			desc:     "OOM with timestamp",
			line:     "[Sun Dec 8 09:23:39 2024] Out of memory: Killed process 123, UID 48, (httpd).",
			wantName: EventOOM,
			wantMsg:  "oom detected",
		},
		{
			desc:     "OOM kill constraint",
			line:     "oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),",
			wantName: EventOOMKillConstraint,
			wantMsg:  "oom kill constraint detected",
		},
		{
			desc:     "OOM killer invoked",
			line:     "postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0",
			wantName: EventOOMKiller,
			wantMsg:  "oom killer detected",
		},
		{
			desc:     "OOM killer triggered",
			line:     "process triggered oom-killer: gfp_mask=0x201d2",
			wantName: EventOOMKiller,
			wantMsg:  "oom killer detected",
		},
		{
			desc:     "OOM cgroup",
			line:     "Memory cgroup out of memory: Killed process 123, UID 48, (httpd).",
			wantName: EventOOMCgroup,
			wantMsg:  "oom cgroup detected",
		},
		{
			desc:     "EDAC correctable error",
			line:     "EDAC MC0: 1 CE memory read error",
			wantName: EventEDACCorrectableErrors,
			wantMsg:  "edac correctable errors detected",
		},
		{
			desc:     "EDAC correctable error with DIMM info",
			line:     "EDAC MC1: 128 CE memory read error on CPU_SrcID#1_Ha#0_Chan#1_DIMM#1",
			wantName: EventEDACCorrectableErrors,
			wantMsg:  "edac correctable errors detected",
		},
		{
			desc:     "non-matching line",
			line:     "some random log line that doesn't match any patterns",
			wantName: "",
			wantMsg:  "",
		},
		{
			desc:     "empty line",
			line:     "",
			wantName: "",
			wantMsg:  "",
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			gotName, gotMsg := Match(tc.line)
			assert.Equal(t, tc.wantName, gotName)
			assert.Equal(t, tc.wantMsg, gotMsg)
		})
	}
}
Loading

0 comments on commit be52aef

Please sign in to comment.