diff --git a/go.mod b/go.mod index 264dc2d9f..c6c3c1245 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.8 require ( github.com/docker/docker v26.1.5+incompatible - github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d + github.com/elastic/elastic-agent-libs v0.17.4 github.com/elastic/go-licenser v0.4.2 github.com/elastic/go-structform v0.0.9 github.com/elastic/go-sysinfo v1.14.1 diff --git a/go.sum b/go.sum index 26097d7a5..bf5ff0216 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,8 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo= -github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= +github.com/elastic/elastic-agent-libs v0.17.4 h1:kWK5Kn2EQjM97yHqbeXv+cFAIti4IiI9Qj8huM+lZzE= +github.com/elastic/elastic-agent-libs v0.17.4/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M= github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A= github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c= github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= diff --git a/metric/system/process/helpers.go b/metric/system/process/helpers.go index 5f4b639e5..0b7546565 100644 --- a/metric/system/process/helpers.go +++ b/metric/system/process/helpers.go @@ -114,9 +114,9 @@ type NonFatalErr struct { func (c NonFatalErr) Error() string { if c.Err != nil { - return "Not enough privileges to fetch information: " + c.Err.Error() + return "non fatal error; reporting partial metrics: " + c.Err.Error() } - return "Not enough privileges to fetch information" + return "non fatal error" } func (c NonFatalErr) Is(other error) bool { diff --git a/metric/system/process/process.go b/metric/system/process/process.go index 55c486180..7045af80d 100644 --- a/metric/system/process/process.go +++ b/metric/system/process/process.go @@ -40,6 +40,8 @@ import ( sysinfotypes "github.com/elastic/go-sysinfo/types" ) +var errFetchingPIDs = "error fetching PID metrics for %d processes, most likely a \"permission denied\" error. Enable debug logging to determine the exact cause." + // ListStates is a wrapper that returns a list of processess with only the basic PID info filled out. func ListStates(hostfs resolve.Resolver) ([]ProcState, error) { init := Stats{ @@ -54,11 +56,15 @@ func ListStates(hostfs resolve.Resolver) ([]ProcState, error) { } // actually fetch the PIDs from the OS-specific code - _, plist, err := init.FetchPids() + pidMap, plist, err := init.FetchPids() if err != nil && !isNonFatal(err) { return nil, fmt.Errorf("error gathering PIDs: %w", err) } - + failedPIDs := extractFailedPIDs(pidMap) + if err != nil && len(failedPIDs) > 0 { + init.logger.Debugf("error fetching process metrics: %v", err) + return plist, NonFatalErr{Err: fmt.Errorf(errFetchingPIDs, len(failedPIDs))} + } return plist, toNonFatal(err) } @@ -96,6 +102,7 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) { if wrappedErr != nil && !isNonFatal(wrappedErr) { return nil, nil, fmt.Errorf("error gathering PIDs: %w", wrappedErr) } + failedPIDs := extractFailedPIDs(pidMap) // We use this to track processes over time. procStats.ProcsMap.SetMap(pidMap) @@ -133,8 +140,11 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) { procs = append(procs, proc) rootEvents = append(rootEvents, rootMap) } - - return procs, rootEvents, toNonFatal(wrappedErr) + if len(failedPIDs) > 0 { + procStats.logger.Debugf("error fetching process metrics: %v", wrappedErr) + return procs, rootEvents, NonFatalErr{Err: fmt.Errorf(errFetchingPIDs, len(failedPIDs))} + } + return procs, rootEvents, nil } // GetOne fetches process data for a given PID if its name matches the regexes provided from the host. @@ -197,6 +207,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) status, saved, err := procStats.pidFill(pid, true) var nonFatalErr error if err != nil { + procMap[pid] = ProcState{Failed: true} if !errors.Is(err, NonFatalErr{}) { procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err) // While monitoring a set of processes, some processes might get killed after we get all the PIDs @@ -206,7 +217,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) } return procMap, proclist, err } - nonFatalErr = fmt.Errorf("non fatal error fetching PID some info for %d, metrics are valid, but partial: %w", pid, err) + nonFatalErr = fmt.Errorf("error for pid %d: %w", pid, err) procStats.logger.Debugf(err.Error()) } if !saved { @@ -252,7 +263,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) { if !errors.Is(err, NonFatalErr{}) { return status, true, fmt.Errorf("FillPidMetrics failed for PID %d: %w", pid, err) } - wrappedErr = errors.Join(wrappedErr, fmt.Errorf("non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %w", pid, err)) + wrappedErr = errors.Join(wrappedErr, err) procStats.logger.Debugf(wrappedErr.Error()) } @@ -409,3 +420,15 @@ func (procStats *Stats) isWhitelistedEnvVar(varName string) bool { } return false } + +func extractFailedPIDs(procMap ProcsMap) []int { + list := make([]int, 0) + for pid, state := range procMap { + if state.Failed { + list = append(list, pid) + // delete the failed state so we don't return the state to caller + delete(procMap, pid) + } + } + return list +} diff --git a/metric/system/process/process_types.go b/metric/system/process/process_types.go index 6df41f29b..16d7e2426 100644 --- a/metric/system/process/process_types.go +++ b/metric/system/process/process_types.go @@ -56,6 +56,9 @@ type ProcState struct { // meta SampleTime time.Time `struct:"-,omitempty"` + + // boolean to indicate that given PID has failed due to some error. + Failed bool `struct:"-,omitempty"` } // ProcCPUInfo is the main struct for CPU metrics