From 774e97bd13edb06c1162fd1d62c42be45e7c4ad1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9o=20Cavaill=C3=A9?=
Date: Tue, 5 May 2015 19:36:02 -0400
Subject: [PATCH 1/2] [process] prettify & cache AccessDenied failures

Refactor this check a bit; two notable changes:

1. On Windows especially, we randomly get `AccessDenied` exceptions when
   trying to look at some processes. Because we iterate over all the
   processes on every run, each of them would trigger a failure *every run*.
   Now, once we get this error and `ignore_denied_access` is set to true,
   we won't look at that process again until DEFAULT_AD_CACHE_DURATION
   seconds have elapsed (configurable with `access_denied_cache_duration`
   in the init config).
2. Instead of scattering sketchy try/except blocks over the code, all the
   psutil calls that actually fetch metric values go through
   `psutil_wrapper`, which knows how to handle exceptions and known
   failures.

Last remark: we previously reported some metrics as 0 even when no value
could be parsed; this is no longer the case.
---
 checks.d/process.py                           | 303 ++++++++++--------
 .../{mock => integration}/test_process.py     | 145 ++++++++-
 2 files changed, 294 insertions(+), 154 deletions(-)
 rename tests/checks/{mock => integration}/test_process.py (56%)

diff --git a/checks.d/process.py b/checks.d/process.py
index 8ee50d54be..6598085baf 100644
--- a/checks.d/process.py
+++ b/checks.d/process.py
@@ -1,4 +1,8 @@
-# 3rd party
+# stdlib
+from collections import defaultdict
+import time
+
+# 3p
 import psutil
 
 # project
@@ -7,170 +11,181 @@ from utils.platform import Platform
 
 
+DEFAULT_AD_CACHE_DURATION = 120
+
+
+ATTR_TO_METRIC = {
+    'thr': 'threads',
+    'cpu': 'cpu.pct',
+    'rss': 'mem.rss',
+    'vms': 'mem.vms',
+    'real': 'mem.real',
+    'open_fd': 'open_file_descriptors',
+    'r_count': 'ioread_count',  # FIXME: namespace me correctly (6.x), io.r_count
+    'w_count': 'iowrite_count',  # FIXME: namespace me correctly (6.x), io.w_count
+    'r_bytes': 'ioread_bytes',  # FIXME: namespace me correctly (6.x), io.r_bytes
+    'w_bytes': 'iowrite_bytes',  # FIXME: namespace me correctly (6.x), io.w_bytes
+    'ctx_swtch_vol': 'voluntary_ctx_switches',  # FIXME: namespace me correctly (6.x), ctx_swt.voluntary
+    'ctx_swtch_invol': 'involuntary_ctx_switches',  # FIXME: namespace me correctly (6.x), ctx_swt.involuntary
+}
+
+
 class ProcessCheck(AgentCheck):
+    def __init__(self, name, init_config, agentConfig, instances=None):
+        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
+
+        # ad stands for access denied
+        # We cache the PIDs getting this error and don't iterate on them
+        # more often than `access_denied_cache_duration`
+        self.last_ad_cache_ts = 0
+        self.ad_cache = set()
+
+        self.access_denied_cache_duration = int(
+            init_config.get(
+                'access_denied_cache_duration',
+                DEFAULT_AD_CACHE_DURATION
+            )
+        )
+
+    def should_refresh_ad_cache(self):
+        now = time.time()
+        return now - self.last_ad_cache_ts > self.access_denied_cache_duration
 
-    SOURCE_TYPE_NAME = 'system'
-
-    PROCESS_GAUGE = (
-        'system.processes.threads',
-        'system.processes.cpu.pct',
-        'system.processes.mem.rss',
-        'system.processes.mem.vms',
-        'system.processes.mem.real',
-        'system.processes.open_file_descriptors',
-        'system.processes.ioread_count',
-        'system.processes.iowrite_count',
-        'system.processes.ioread_bytes',
-        'system.processes.iowrite_bytes',
-        'system.processes.voluntary_ctx_switches',
-        'system.processes.involuntary_ctx_switches',
-    )
-
-    def find_pids(self, search_string, exact_match, ignore_denied_access):
+    def find_pids(self, search_string, exact_match, ignore_ad=True,
+                  
refresh_ad_cache=True): """ Create a set of pids of selected processes. Search for search_string """ - found_process_list = [] + matching_pids = set() + for proc in psutil.process_iter(): + # Skip access denied processes + if not refresh_ad_cache and proc.pid in self.ad_cache: + continue + found = False for string in search_string: - if exact_match: - try: + try: + if exact_match: if proc.name() == string: found = True - except psutil.NoSuchProcess: - self.log.warning('Process disappeared while scanning') - except psutil.AccessDenied, e: - self.log.error('Access denied to %s process' % string) - self.log.error('Error: %s' % e) - if not ignore_denied_access: - raise + break + else: + cmdline = proc.cmdline() + if string in ' '.join(cmdline): + found = True + break + except psutil.NoSuchProcess: + self.log.warning('Process disappeared while scanning') + except psutil.AccessDenied, e: + self.log.error('Access denied to %s process' % string) + self.log.error('Error: %s' % e) + if refresh_ad_cache: + self.ad_cache.add(proc.pid) + if not ignore_ad: + raise else: - if not found: - try: - cmdline = proc.cmdline() - if string in ' '.join(cmdline): - found = True - except psutil.NoSuchProcess: - self.warning('Process disappeared while scanning') - except psutil.AccessDenied, e: - self.log.error('Access denied to %s process' % string) - self.log.error('Error: %s' % e) - if not ignore_denied_access: - raise - - if found or string == 'All': - found_process_list.append(proc.pid) - - return set(found_process_list) - - def get_process_metrics(self, pids, cpu_check_interval, ignore_denied_access=True): - - # initialize process metrics - # process metrics available for all versions of psutil - rss = 0 - vms = 0 - cpu = 0 - thr = 0 - voluntary_ctx_switches = 0 - involuntary_ctx_switches = 0 - - # process metrics available for psutil versions 0.6.0 and later - if Platform.is_win32() or Platform.is_solaris(): - real = None - else: - real = 0 + if refresh_ad_cache: + self.ad_cache.discard(proc.pid) - if Platform.is_unix(): - open_file_descriptors = 0 - else: - open_file_descriptors = None + if found or string == 'All': + matching_pids.add(proc.pid) - # process I/O counters (agent might not have permission to access) - read_count = 0 - write_count = 0 - read_bytes = 0 - write_bytes = 0 + return matching_pids - got_denied = False + def psutil_wrapper(self, process, method, accessors, *args, **kwargs): + """ + A psutil wrapper that is calling + * psutil.method(*args, **kwargs) and returns the result + OR + * psutil.method(*args, **kwargs).accessor[i] for each accessors given in + a list, the result being indexed in a dictionary by the accessor name + """ - for pid in set(pids): - try: - p = psutil.Process(pid) - try: - if real is not None: - mem = p.memory_info_ex() - real += mem.rss - mem.shared - else: - mem = p.memory_info() - - if Platform.is_unix(): - ctx_switches = p.num_ctx_switches() - voluntary_ctx_switches += ctx_switches.voluntary - involuntary_ctx_switches += ctx_switches.involuntary - - rss += mem.rss - vms += mem.vms - thr += p.num_threads() - cpu += p.cpu_percent(cpu_check_interval) - - if open_file_descriptors is not None: - open_file_descriptors += p.num_fds() - - except NotImplementedError: - # Handle old Kernels which don't provide this info. 
- voluntary_ctx_switches = None - involuntary_ctx_switches = None - except AttributeError: - self.log.debug("process attribute not supported on this platform") - except psutil.AccessDenied: - got_denied = True - - # user agent might not have permission to call io_counters() - # user agent might have access to io counters for some processes and not others - if read_count is not None: + if accessors is None: + result = None + else: + result = {} + + # Ban certain method that we know fail + if method == 'memory_info_ex'\ + and (Platform.is_win32() or Platform.is_solaris()): + return result + elif method == 'num_fds' and not Platform.is_unix(): + return result + + try: + res = getattr(process, method)(*args, **kwargs) + if accessors is None: + result = res + else: + for acc in accessors: try: - io_counters = p.io_counters() - read_count += io_counters.read_count - write_count += io_counters.write_count - read_bytes += io_counters.read_bytes - write_bytes += io_counters.write_bytes + result[acc] = getattr(res, acc) except AttributeError: - self.log.debug("process attribute not supported on this platform") - except psutil.AccessDenied: - log_func = self.log.debug if ignore_denied_access else self.log.info - log_func('dd-agent user does not have access \ - to I/O counters for process %d: %s' % (pid, p.name())) - read_count = None - write_count = None - read_bytes = None - write_bytes = None + self.log.debug("psutil.{0}().{1} attribute does not exist".format(method, acc)) + except (NotImplementedError, AttributeError): + self.log.debug("psutil method {0} not implemented".format(method)) + except psutil.AccessDenied: + self.log.debug("psutil was denied acccess for method {0}".format(method)) + except psutil.NoSuchProcess: + self.warning("Process {0} disappeared while scanning" % process.pid) + + return result + + def get_process_state(self, pids, cpu_check_interval, ignore_ad=True): + st = defaultdict(list) + for pid in pids: + st['pids'].append(pid) + + try: + p = psutil.Process(pid) # Skip processes dead in the meantime except psutil.NoSuchProcess: self.warning('Process %s disappeared while scanning' % pid) - if got_denied and not ignore_denied_access: - self.warning('The Datadog Agent was denied access ' - 'when trying to get the number of file descriptors') + meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms']) + st['rss'].append(meminfo.get('rss')) + st['vms'].append(meminfo.get('vms')) + + # will fail on win32 and solaris + shared_mem = self.psutil_wrapper(p, 'memory_info_ex', ['shared']).get('shared') + if shared_mem is not None and meminfo.get('rss') is not None: + st['real'].append(meminfo['rss'] - shared_mem) + else: + st['real'].append(None) + + ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary']) + st['ctx_swtch_vol'].append(ctxinfo.get('voluntary')) + st['ctx_swtch_invol'].append(ctxinfo.get('involuntary')) - # Memory values are in Byte - return (thr, cpu, rss, vms, real, open_file_descriptors, - read_count, write_count, read_bytes, write_bytes, - voluntary_ctx_switches, involuntary_ctx_switches) + st['thr'].append(self.psutil_wrapper(p, 'num_threads', None)) + st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None, cpu_check_interval)) + + st['open_fd'].append(self.psutil_wrapper(p, 'num_fds', None)) + + ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes']) + st['r_count'].append(ioinfo.get('read_count')) + st['w_count'].append(ioinfo.get('write_count')) + 
st['r_bytes'].append(ioinfo.get('read_bytes')) + st['w_bytes'].append(ioinfo.get('write_bytes')) + + return st def check(self, instance): name = instance.get('name', None) tags = instance.get('tags', []) exact_match = _is_affirmative(instance.get('exact_match', True)) search_string = instance.get('search_string', None) - ignore_denied_access = _is_affirmative(instance.get('ignore_denied_access', True)) + ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True)) cpu_check_interval = instance.get('cpu_check_interval', 0.1) if not isinstance(search_string, list): raise KeyError('"search_string" parameter should be a list') + # FIXME 6.x remove me if "All" in search_string: self.warning('Deprecated: Having "All" in your search_string will' 'greatly reduce the performance of the check and ' @@ -186,21 +201,29 @@ def check(self, instance): self.warning("cpu_check_interval must be a number. Defaulting to 0.1") cpu_check_interval = 0.1 - pids = self.find_pids(search_string, - exact_match, - ignore_denied_access) + refresh_ad_cache = self.should_refresh_ad_cache() + + pids = self.find_pids( + search_string, + exact_match, + ignore_ad=ignore_ad, + refresh_ad_cache=refresh_ad_cache + ) + + proc_state = self.get_process_state(pids, cpu_check_interval) + + # FIXME 6.x remove the `name` tag tags.extend(['process_name:%s' % name, name]) self.log.debug('ProcessCheck: process %s analysed' % name) - self.gauge('system.processes.number', len(pids), tags=tags) - metrics = dict(zip(ProcessCheck.PROCESS_GAUGE, self.get_process_metrics(pids, - cpu_check_interval, ignore_denied_access))) - - for metric, value in metrics.iteritems(): - if value is not None: - self.gauge(metric, value, tags=tags) + for attr, mname in ATTR_TO_METRIC.iteritems(): + vals = [x for x in proc_state[attr] if x is not None] + # skip [] + if vals: + # FIXME 6.x: change this prefix? 
+ self.gauge('system.processes.%s' % mname, sum(vals), tags=tags) self._process_service_check(name, len(pids), instance.get('thresholds', None)) diff --git a/tests/checks/mock/test_process.py b/tests/checks/integration/test_process.py similarity index 56% rename from tests/checks/mock/test_process.py rename to tests/checks/integration/test_process.py index 67d6ac4321..2186b7ae17 100644 --- a/tests/checks/mock/test_process.py +++ b/tests/checks/integration/test_process.py @@ -1,12 +1,36 @@ +""" Put in integration/ +because it requires psutil to function properly +""" + +# stdlib import os -from nose.plugins.attrib import attr -from checks import AgentCheck +# 3p +from mock import patch +import psutil + +# project from tests.checks.common import AgentCheckTest -@attr('process') -class ProcessTestCase(AgentCheckTest): +# cross-platform switches +_PSUTIL_IO_COUNTERS = True +try: + p = psutil.Process(os.getpid()) + p.io_counters() +except Exception: + _PSUTIL_IO_COUNTERS = False + +_PSUTIL_MEM_SHARED = True +try: + p = psutil.Process(os.getpid()) + p.memory_info_ex().shared +except Exception: + _PSUTIL_MEM_SHARED = False + + + +class ProcessCheckTest(AgentCheckTest): CHECK_NAME = 'process' CONFIG_STUBS = [ @@ -101,7 +125,84 @@ class ProcessTestCase(AgentCheckTest): 'system.processes.voluntary_ctx_switches' ] - def mock_find_pids(self, search_string, exact_match=True, ignore_denied_access=True): + def get_psutil_proc(self): + return psutil.Process(os.getpid()) + + def test_psutil_wrapper_simple(self): + # Load check with empty config + self.run_check({}) + name = self.check.psutil_wrapper( + self.get_psutil_proc(), + 'name', + None + ) + + self.assertNotEquals(name, None) + + def test_psutil_wrapper_simple_fail(self): + # Load check with empty config + self.run_check({}) + name = self.check.psutil_wrapper( + self.get_psutil_proc(), + 'blah', + None + ) + + self.assertEquals(name, None) + + def test_psutil_wrapper_accessors(self): + # Load check with empty config + self.run_check({}) + meminfo = self.check.psutil_wrapper( + self.get_psutil_proc(), + 'memory_info', + ['rss', 'vms', 'foo'] + ) + + self.assertIn('rss', meminfo) + self.assertIn('vms', meminfo) + self.assertNotIn('foo', meminfo) + + def test_psutil_wrapper_accessors_fail(self): + # Load check with empty config + self.run_check({}) + meminfo = self.check.psutil_wrapper( + self.get_psutil_proc(), + 'memory_infoo', + ['rss', 'vms'] + ) + + self.assertNotIn('rss', meminfo) + self.assertNotIn('vms', meminfo) + + def test_ad_cache(self): + config = { + 'instances': [{ + 'name': 'python', + 'search_string': ['python'], + 'ignore_denied_access': 'false' + }] + } + + def deny_cmdline(obj): + raise psutil.AccessDenied() + + try: + with patch.object(psutil.Process, 'name', deny_cmdline): + self.run_check(config) + except psutil.AccessDenied: + pass + + self.assertTrue(len(self.check.ad_cache) > 0) + # The next run shoudn't throw an exception + self.run_check(config) + + # Reset the cache now + self.check.last_ad_cache_ts = 0 + self.assertRaises(Exception, lambda: self.run_check, config) + + def mock_find_pids(self, search_string, exact_match=True, ignore_ad=True, + refresh_ad_cache=True): idx = search_string[0].split('_')[1] # Use a real PID to get real metrics! 
         return [os.getpid()] * self.CONFIG_STUBS[int(idx)]['mocked_processes']
 
@@ -117,6 +218,8 @@ def test_check(self):
         self.run_check(config, mocks=mocks)
 
         for stub in self.CONFIG_STUBS:
+            mocked_processes = stub['mocked_processes']
+
             # Assert metrics
             for mname in self.PROCESS_METRIC:
                 proc_name = stub['config']['name']
@@ -127,10 +230,23 @@ def test_check(self):
                     expected_tags += stub['config']['tags']
 
                 expected_value = None
+                # cases where we don't actually expect some metrics
+                # - if no processes are matched we don't send metrics except number
+                # - if io_counters() is not available skip the metrics
+                # - if memory_info_ex() is not available skip the metrics
+                if (mocked_processes == 0 and mname != 'system.processes.number')\
+                        or (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
+                        or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
+                    continue
+
                 if mname == 'system.processes.number':
-                    expected_value = stub['mocked_processes']
+                    expected_value = mocked_processes
 
-                self.assertMetric(mname, count=1, tags=expected_tags, value=expected_value)
+                self.assertMetric(
+                    mname, count=1,
+                    tags=expected_tags,
+                    value=expected_value
+                )
 
             # Assert service checks
             expected_tags = ['process:{0}'.format(stub['config']['name'])]
@@ -139,13 +255,11 @@ def test_check(self):
             procs = stub['mocked_processes']
 
             if critical is not None and (procs < critical[0] or procs > critical[1]):
-                expected_status = AgentCheck.CRITICAL
+                self.assertServiceCheckCritical('process.up', count=1, tags=expected_tags)
             elif warning is not None and (procs < warning[0] or procs > warning[1]):
-                expected_status = AgentCheck.WARNING
+                self.assertServiceCheckWarning('process.up', count=1, tags=expected_tags)
             else:
-                expected_status = AgentCheck.OK
-            self.assertServiceCheck('process.up', status=expected_status,
-                                    count=1, tags=expected_tags)
+                self.assertServiceCheckOK('process.up', count=1, tags=expected_tags)
 
         # Raises when COVERAGE=true and coverage < 100%
         self.coverage_report()
@@ -166,9 +280,12 @@ def test_check_real_process(self):
         expected_tags = ['py', 'process_name:py']
 
         for mname in self.PROCESS_METRIC:
+            # See the same comment in `test_check` above
+            if (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
+                    or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
+                continue
             self.assertMetric(mname, at_least=1, tags=expected_tags)
 
-        self.assertServiceCheck('process.up', status=AgentCheck.OK,
-                                count=1, tags=['process:py'])
+        self.assertServiceCheckOK('process.up', count=1, tags=['process:py'])
 
         self.coverage_report()

From cc5a67bb47110ef145e58561ab4471b946354742 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9o=20Cavaill=C3=A9?=
Date: Wed, 13 May 2015 11:49:57 -0400
Subject: [PATCH 2/2] [process] add a PID cache, avoid iter on all procs

The new init_config option `pid_cache_duration` is the period during
which the agent won't try to refresh the "matching" PID list for each
instance. It avoids iterating over all the processes on every run; the
cache is invalidated when:
* the deadline expires
* or one of the cached processes can't be read anymore

You might want to lower the cache duration if you want to alert on
no-data.
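
For reference, a minimal sketch of the caching pattern described above
(simplified, not the exact implementation from the diff below; the
`scan_processes` callable and `on_process_disappeared` helper are only
illustrative):

    # Simplified sketch of the per-instance PID cache; the real logic
    # lives in find_pids() and get_process_state() below.
    import time

    DEFAULT_PID_CACHE_DURATION = 120   # init_config: pid_cache_duration

    pid_cache = {}            # instance name -> set of matching PIDs
    last_pid_cache_ts = {}    # instance name -> timestamp of the last refresh

    def should_refresh_pid_cache(name):
        return time.time() - last_pid_cache_ts.get(name, 0) > DEFAULT_PID_CACHE_DURATION

    def find_pids(name, scan_processes):
        # Serve the cached PID set until the deadline expires...
        if not should_refresh_pid_cache(name):
            return pid_cache[name]
        # ...then rescan all processes and reset the clock.
        pid_cache[name] = scan_processes()
        last_pid_cache_ts[name] = time.time()
        return pid_cache[name]

    def on_process_disappeared(name):
        # A cached PID can't be read anymore: force a rescan on the next run.
        last_pid_cache_ts[name] = 0

Lowering `pid_cache_duration` trades a little extra process scanning for
faster detection of processes that have gone away.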
--- checks.d/process.py | 70 +++++++++++++++++------- conf.d/process.yaml.example | 4 ++ tests/checks/integration/test_process.py | 2 +- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/checks.d/process.py b/checks.d/process.py index 6598085baf..56cc0b6925 100644 --- a/checks.d/process.py +++ b/checks.d/process.py @@ -12,6 +12,7 @@ DEFAULT_AD_CACHE_DURATION = 120 +DEFAULT_PID_CACHE_DURATION = 120 ATTR_TO_METRIC = { @@ -37,9 +38,9 @@ def __init__(self, name, init_config, agentConfig, instances=None): # ad stands for access denied # We cache the PIDs getting this error and don't iterate on them # more often than `access_denied_cache_duration` + # This cache is for all PIDs so it's global self.last_ad_cache_ts = 0 self.ad_cache = set() - self.access_denied_cache_duration = int( init_config.get( 'access_denied_cache_duration', @@ -47,16 +48,41 @@ def __init__(self, name, init_config, agentConfig, instances=None): ) ) + # By default cache the PID list for a while + # Sometimes it's not wanted b/c it can mess with no-data monitoring + # This cache is indexed per instance + self.last_pid_cache_ts = {} + self.pid_cache = {} + self.pid_cache_duration = int( + init_config.get( + 'pid_cache_duration', + DEFAULT_PID_CACHE_DURATION + ) + ) + def should_refresh_ad_cache(self): now = time.time() return now - self.last_ad_cache_ts > self.access_denied_cache_duration - def find_pids(self, search_string, exact_match, ignore_ad=True, + def should_refresh_pid_cache(self, name): + now = time.time() + return now - self.last_pid_cache_ts.get(name, 0) > self.pid_cache_duration + + def find_pids(self, name, search_string, exact_match, ignore_ad=True, refresh_ad_cache=True): """ Create a set of pids of selected processes. Search for search_string """ + if not self.should_refresh_pid_cache(name): + return self.pid_cache[name] + + ad_error_logger = self.log.debug + if not ignore_ad: + ad_error_logger = self.log.error + + refresh_ad_cache = self.should_refresh_ad_cache() + matching_pids = set() for proc in psutil.process_iter(): @@ -67,20 +93,21 @@ def find_pids(self, search_string, exact_match, ignore_ad=True, found = False for string in search_string: try: + # FIXME 6.x: All has been deprecated from the doc, should be removed + if string == 'All': + found = True if exact_match: if proc.name() == string: found = True - break else: cmdline = proc.cmdline() if string in ' '.join(cmdline): found = True - break except psutil.NoSuchProcess: self.log.warning('Process disappeared while scanning') except psutil.AccessDenied, e: - self.log.error('Access denied to %s process' % string) - self.log.error('Error: %s' % e) + ad_error_logger('Access denied to process with PID %s', proc.pid) + ad_error_logger('Error: %s', e) if refresh_ad_cache: self.ad_cache.add(proc.pid) if not ignore_ad: @@ -88,10 +115,12 @@ def find_pids(self, search_string, exact_match, ignore_ad=True, else: if refresh_ad_cache: self.ad_cache.discard(proc.pid) + if found: + matching_pids.add(proc.pid) + break - if found or string == 'All': - matching_pids.add(proc.pid) - + self.pid_cache[name] = matching_pids + self.last_pid_cache_ts[name] = time.time() return matching_pids def psutil_wrapper(self, process, method, accessors, *args, **kwargs): @@ -124,17 +153,18 @@ def psutil_wrapper(self, process, method, accessors, *args, **kwargs): try: result[acc] = getattr(res, acc) except AttributeError: - self.log.debug("psutil.{0}().{1} attribute does not exist".format(method, acc)) + self.log.debug("psutil.%s().%s attribute does not exist", method, acc) 
except (NotImplementedError, AttributeError): - self.log.debug("psutil method {0} not implemented".format(method)) + self.log.debug("psutil method %s not implemented", method) except psutil.AccessDenied: - self.log.debug("psutil was denied acccess for method {0}".format(method)) + self.log.debug("psutil was denied acccess for method %s", method) except psutil.NoSuchProcess: - self.warning("Process {0} disappeared while scanning" % process.pid) + self.warning("Process {0} disappeared while scanning".format(process.pid)) return result - def get_process_state(self, pids, cpu_check_interval, ignore_ad=True): + + def get_process_state(self, name, pids, cpu_check_interval): st = defaultdict(list) for pid in pids: @@ -145,6 +175,8 @@ def get_process_state(self, pids, cpu_check_interval, ignore_ad=True): # Skip processes dead in the meantime except psutil.NoSuchProcess: self.warning('Process %s disappeared while scanning' % pid) + # reset the PID cache now, something chaned + self.last_pid_cache_ts[name] = 0 meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms']) st['rss'].append(meminfo.get('rss')) @@ -201,21 +233,19 @@ def check(self, instance): self.warning("cpu_check_interval must be a number. Defaulting to 0.1") cpu_check_interval = 0.1 - refresh_ad_cache = self.should_refresh_ad_cache() - pids = self.find_pids( + name, search_string, exact_match, - ignore_ad=ignore_ad, - refresh_ad_cache=refresh_ad_cache + ignore_ad=ignore_ad ) - proc_state = self.get_process_state(pids, cpu_check_interval) + proc_state = self.get_process_state(name, pids, cpu_check_interval) # FIXME 6.x remove the `name` tag tags.extend(['process_name:%s' % name, name]) - self.log.debug('ProcessCheck: process %s analysed' % name) + self.log.debug('ProcessCheck: process %s analysed', name) self.gauge('system.processes.number', len(pids), tags=tags) for attr, mname in ATTR_TO_METRIC.iteritems(): diff --git a/conf.d/process.yaml.example b/conf.d/process.yaml.example index 4ad3846741..66a14e4f3e 100644 --- a/conf.d/process.yaml.example +++ b/conf.d/process.yaml.example @@ -1,4 +1,8 @@ init_config: + # the check will refresh the matching pid list every X seconds + # except if it detects a change before. You might want to set it + # low if you want to alert on process service checks. + # pid_cache_duration: 120 instances: # - name: (required) STRING. It will be used to uniquely identify your metrics as they will be tagged with this name diff --git a/tests/checks/integration/test_process.py b/tests/checks/integration/test_process.py index 2186b7ae17..a39ba38691 100644 --- a/tests/checks/integration/test_process.py +++ b/tests/checks/integration/test_process.py @@ -201,7 +201,7 @@ def deny_cmdline(obj): self.check.last_ad_cache_ts = 0 self.assertRaises(Exception, lambda: self.run_check, config) - def mock_find_pids(self, search_string, exact_match=True, ignore_ad=True, + def mock_find_pids(self, name, search_string, exact_match=True, ignore_ad=True, refresh_ad_cache=True): idx = search_string[0].split('_')[1] # Use a real PID to get real metrics!