Skip to content

Commit

Permalink
[process] Improve sampling of cpu.pct
Browse files Browse the repository at this point in the history
Improvements:
- Non-blocking call to `cpu_percent` after the first sample
- Retrieves an averaged value since the last call (instead of an
averaged value over the fraction of a second defined by
`cpu_check_interval`)

To use the non-blocking call we keep Process instances through
multiple check runs.

Also updated a test on the check, on which we stop using real PIDs
because the test was previously relying on the of `find_pids` to
return a list of identical PIDs, whereas `find_pids` actually returns
a set of PIDs.
  • Loading branch information
olivielpeau committed Sep 18, 2015
1 parent 040e967 commit 60a8eb3
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 31 deletions.
43 changes: 33 additions & 10 deletions checks.d/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def __init__(self, name, init_config, agentConfig, instances=None):
)
)

# Process cache, indexed by instance
self.process_cache = {}

def should_refresh_ad_cache(self):
now = time.time()
return now - self.last_ad_cache_ts > self.access_denied_cache_duration
Expand Down Expand Up @@ -163,21 +166,36 @@ def psutil_wrapper(self, process, method, accessors, *args, **kwargs):

return result


def get_process_state(self, name, pids, cpu_check_interval):
st = defaultdict(list)

if name not in self.process_cache:
self.process_cache[name] = {}

This comment has been minimized.

Copy link
@yannmh

yannmh Sep 18, 2015

Member

You can remove these two lines by using a defaultdict here too

self.process_cache = defaultdict(dict)

# Remove from cache the processes that are not in `pids`
cached_pids = set(self.process_cache[name].keys())
pids_to_remove = cached_pids - pids
for pid in pids_to_remove:
del self.process_cache[name][pid]

for pid in pids:
st['pids'].append(pid)

try:
p = psutil.Process(pid)
# Skip processes dead in the meantime
except psutil.NoSuchProcess:
self.warning('Process %s disappeared while scanning' % pid)
# reset the PID cache now, something changed
self.last_pid_cache_ts[name] = 0
continue
new_process = False
# If the pid's process is not cached, retrieve it
if pid not in self.process_cache[name] or not self.process_cache[name][pid].is_running():
new_process = True
try:
self.process_cache[name][pid] = psutil.Process(pid)
self.log.debug('New process in cache: %s' % pid)
# Skip processes dead in the meantime
except psutil.NoSuchProcess:
self.warning('Process %s disappeared while scanning' % pid)
# reset the PID cache now, something changed
self.last_pid_cache_ts[name] = 0
continue

p = self.process_cache[name][pid]

meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
st['rss'].append(meminfo.get('rss'))
Expand All @@ -195,7 +213,12 @@ def get_process_state(self, name, pids, cpu_check_interval):
st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

st['thr'].append(self.psutil_wrapper(p, 'num_threads', None))
st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None, cpu_check_interval))
if new_process:
# Blocking call for `cpu_check_interval` seconds
st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None, cpu_check_interval))
else:
# Non-blocking call
st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None))

st['open_fd'].append(self.psutil_wrapper(p, 'num_fds', None))

Expand Down
52 changes: 31 additions & 21 deletions tests/checks/integration/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
_PSUTIL_MEM_SHARED = False


class MockProcess(object):
def is_running(self):
return True


class ProcessCheckTest(AgentCheckTest):
CHECK_NAME = 'process'
Expand All @@ -43,7 +47,7 @@ class ProcessCheckTest(AgentCheckTest):
'warning': [1, 5]
}
},
'mocked_processes': 0
'mocked_processes': set()
},
{
'config': {
Expand All @@ -54,7 +58,7 @@ class ProcessCheckTest(AgentCheckTest):
'warning': [2, 4]
}
},
'mocked_processes': 1
'mocked_processes': set([1])
},
{
'config': {
Expand All @@ -65,7 +69,7 @@ class ProcessCheckTest(AgentCheckTest):
'warning': [1, 4]
}
},
'mocked_processes': 3
'mocked_processes': set([22, 35])
},
{
'config': {
Expand All @@ -76,7 +80,7 @@ class ProcessCheckTest(AgentCheckTest):
'warning': [2, 5]
}
},
'mocked_processes': 5
'mocked_processes': set([1, 5, 44, 901, 34])
},
{
'config': {
Expand All @@ -87,15 +91,15 @@ class ProcessCheckTest(AgentCheckTest):
'warning': [2, 5]
}
},
'mocked_processes': 6
'mocked_processes': set([3, 7, 2, 9, 34, 72])
},
{
'config': {
'name': 'test_tags',
'search_string': ['test_5'], # index in the array for our find_pids mock
'tags': ['onetag', 'env:prod']
},
'mocked_processes': 1
'mocked_processes': set([2])
},
{
'config': {
Expand All @@ -105,7 +109,7 @@ class ProcessCheckTest(AgentCheckTest):
'test': 'test'
}
},
'mocked_processes': 1
'mocked_processes': set([89])
},
]

Expand Down Expand Up @@ -204,12 +208,21 @@ def deny_cmdline(obj):
def mock_find_pids(self, name, search_string, exact_match=True, ignore_ad=True,
refresh_ad_cache=True):
idx = search_string[0].split('_')[1]
# Use a real PID to get real metrics!
return [os.getpid()] * self.CONFIG_STUBS[int(idx)]['mocked_processes']
return self.CONFIG_STUBS[int(idx)]['mocked_processes']

def mock_psutil_wrapper(self, process, method, accessors, *args, **kwargs):
if accessors is None:
result = 0
else:
result = dict([(accessor, 0) for accessor in accessors])

return result

def test_check(self):
@patch('psutil.Process', return_value=MockProcess())
def test_check(self, mock_process):
mocks = {
'find_pids': self.mock_find_pids
'find_pids': self.mock_find_pids,
'psutil_wrapper': self.mock_psutil_wrapper,
}

config = {
Expand All @@ -230,17 +243,12 @@ def test_check(self):
expected_tags += stub['config']['tags']

expected_value = None
# cases where we don't actually expect some metrics
# - if no processes are matched we don't send metrics except number
# - if io_counters() is not available skip the metrics
# - if memory_info_ex() is not available skip the metrics
if (mocked_processes == 0 and mname != 'system.processes.number')\
or (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
# if no processes are matched we don't send metrics except number
if len(mocked_processes) == 0 and mname != 'system.processes.number':

This comment has been minimized.

Copy link
@yannmh

yannmh Sep 18, 2015

Member

I find a bit strange to 'test' and continue on something that should never happen

len(mocked_processes) == 0 and mname != system.processes.number
continue

if mname == 'system.processes.number':
expected_value = mocked_processes
expected_value = len(mocked_processes)

self.assertMetric(
mname, count=1,
Expand All @@ -252,7 +260,7 @@ def test_check(self):
expected_tags = ['process:{0}'.format(stub['config']['name'])]
critical = stub['config'].get('thresholds', {}).get('critical')
warning = stub['config'].get('thresholds', {}).get('warning')
procs = stub['mocked_processes']
procs = len(stub['mocked_processes'])

if critical is not None and (procs < critical[0] or procs > critical[1]):
self.assertServiceCheckCritical('process.up', count=1, tags=expected_tags)
Expand Down Expand Up @@ -280,7 +288,9 @@ def test_check_real_process(self):

expected_tags = ['py', 'process_name:py']
for mname in self.PROCESS_METRIC:
# See same tests comment in the function `test_check`
# cases where we don't actually expect some metrics here:
# - if io_counters() is not available
# - if memory_info_ex() is not available
if (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
continue
Expand Down

0 comments on commit 60a8eb3

Please sign in to comment.