Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[chore][process] narrow the log that is passed to the caller #195

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.22.8

require (
github.com/docker/docker v26.1.5+incompatible
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d
github.com/elastic/elastic-agent-libs v0.17.4
leehinman marked this conversation as resolved.
Show resolved Hide resolved
github.com/elastic/go-licenser v0.4.2
github.com/elastic/go-structform v0.0.9
github.com/elastic/go-sysinfo v1.14.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh
github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo=
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M=
github.com/elastic/elastic-agent-libs v0.17.4 h1:kWK5Kn2EQjM97yHqbeXv+cFAIti4IiI9Qj8huM+lZzE=
github.com/elastic/elastic-agent-libs v0.17.4/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M=
github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A=
github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c=
github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o=
Expand Down
4 changes: 2 additions & 2 deletions metric/system/process/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ type NonFatalErr struct {

func (c NonFatalErr) Error() string {
if c.Err != nil {
return "Not enough privileges to fetch information: " + c.Err.Error()
return "non fatal error; reporting partial metrics: " + c.Err.Error()
}
return "Not enough privileges to fetch information"
return "non fatal error"
}

func (c NonFatalErr) Is(other error) bool {
Expand Down
27 changes: 23 additions & 4 deletions metric/system/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
sysinfotypes "github.com/elastic/go-sysinfo/types"
)

var errFetchingPIDs = "error fetching PID metrics for %d processes, most likely a \"permission denied\" error. Enable debug logging to determine the exact cause."

// ListStates is a wrapper that returns a list of processess with only the basic PID info filled out.
func ListStates(hostfs resolve.Resolver) ([]ProcState, error) {
init := Stats{
Expand Down Expand Up @@ -96,6 +98,7 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
if wrappedErr != nil && !isNonFatal(wrappedErr) {
return nil, nil, fmt.Errorf("error gathering PIDs: %w", wrappedErr)
}
failedPIDs := extractFailedPIDs(pidMap)
// We use this to track processes over time.
procStats.ProcsMap.SetMap(pidMap)

Expand Down Expand Up @@ -133,8 +136,11 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
procs = append(procs, proc)
rootEvents = append(rootEvents, rootMap)
}

return procs, rootEvents, toNonFatal(wrappedErr)
if len(failedPIDs) > 0 {
procStats.logger.Debugf("error fetching process metrics: %v", wrappedErr)
return procs, rootEvents, NonFatalErr{Err: fmt.Errorf(errFetchingPIDs, len(failedPIDs))}
}
return procs, rootEvents, nil
}

// GetOne fetches process data for a given PID if its name matches the regexes provided from the host.
Expand Down Expand Up @@ -197,6 +203,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
status, saved, err := procStats.pidFill(pid, true)
var nonFatalErr error
if err != nil {
procMap[pid] = ProcState{Failed: true}
if !errors.Is(err, NonFatalErr{}) {
procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err)
// While monitoring a set of processes, some processes might get killed after we get all the PIDs
Expand All @@ -206,7 +213,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
}
return procMap, proclist, err
}
nonFatalErr = fmt.Errorf("non fatal error fetching PID some info for %d, metrics are valid, but partial: %w", pid, err)
nonFatalErr = fmt.Errorf("error for pid %d: %w", pid, err)
procStats.logger.Debugf(err.Error())
}
if !saved {
Expand Down Expand Up @@ -252,7 +259,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
if !errors.Is(err, NonFatalErr{}) {
return status, true, fmt.Errorf("FillPidMetrics failed for PID %d: %w", pid, err)
}
wrappedErr = errors.Join(wrappedErr, fmt.Errorf("non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %w", pid, err))
wrappedErr = errors.Join(wrappedErr, err)
procStats.logger.Debugf(wrappedErr.Error())
}

Expand Down Expand Up @@ -409,3 +416,15 @@ func (procStats *Stats) isWhitelistedEnvVar(varName string) bool {
}
return false
}

func extractFailedPIDs(procMap ProcsMap) []int {
list := make([]int, 0)
for pid, state := range procMap {
if state.Failed {
list = append(list, pid)
// delete the failed state so we don't return the state to caller
delete(procMap, pid)
}
}
return list
}
3 changes: 3 additions & 0 deletions metric/system/process/process_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ type ProcState struct {

// meta
SampleTime time.Time `struct:"-,omitempty"`

// boolean to indicate that given PID has failed due to some error.
Failed bool `struct:"-,omitempty"`
}

// ProcCPUInfo is the main struct for CPU metrics
Expand Down
Loading