[process] Improve sampling of cpu.pct

Improvements: - Non-blocking call to `cpu_percent` after the first sample - Retrieves an averaged value since the last call (instead of an averaged value over the fraction of a second defined by `cpu_check_interval`) To use the non-blocking call we keep Process instances through multiple check runs. Also updated a test on the check, on which we stop using real PIDs because the test was previously relying on the of `find_pids` to return a list of identical PIDs, whereas `find_pids` actually returns a set of PIDs.
DataDog · Sep 18, 2015 · 60a8eb3 · yannmh · Sep 18, 2015 · yannmh
1 parent 040e967
commit 60a8eb3
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 31 deletions.
diff --git a/checks.d/process.py b/checks.d/process.py
@@ -60,6 +60,9 @@ def __init__(self, name, init_config, agentConfig, instances=None):
             )
         )
 
+        # Process cache, indexed by instance
+        self.process_cache = {}
+
     def should_refresh_ad_cache(self):
         now = time.time()
         return now - self.last_ad_cache_ts > self.access_denied_cache_duration
@@ -163,21 +166,36 @@ def psutil_wrapper(self, process, method, accessors, *args, **kwargs):
 
         return result
 
-
     def get_process_state(self, name, pids, cpu_check_interval):
         st = defaultdict(list)
 
+        if name not in self.process_cache:
+            self.process_cache[name] = {}
+
+        # Remove from cache the processes that are not in `pids`
+        cached_pids = set(self.process_cache[name].keys())
+        pids_to_remove = cached_pids - pids
+        for pid in pids_to_remove:
+            del self.process_cache[name][pid]
+
         for pid in pids:
             st['pids'].append(pid)
 
-            try:
-                p = psutil.Process(pid)
-            # Skip processes dead in the meantime
-            except psutil.NoSuchProcess:
-                self.warning('Process %s disappeared while scanning' % pid)
-                # reset the PID cache now, something changed
-                self.last_pid_cache_ts[name] = 0
-                continue
+            new_process = False
+            # If the pid's process is not cached, retrieve it
+            if pid not in self.process_cache[name] or not self.process_cache[name][pid].is_running():
+                new_process = True
+                try:
+                    self.process_cache[name][pid] = psutil.Process(pid)
+                    self.log.debug('New process in cache: %s' % pid)
+                # Skip processes dead in the meantime
+                except psutil.NoSuchProcess:
+                    self.warning('Process %s disappeared while scanning' % pid)
+                    # reset the PID cache now, something changed
+                    self.last_pid_cache_ts[name] = 0
+                    continue
+
+            p = self.process_cache[name][pid]
 
             meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
             st['rss'].append(meminfo.get('rss'))
@@ -195,7 +213,12 @@ def get_process_state(self, name, pids, cpu_check_interval):
             st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))
 
             st['thr'].append(self.psutil_wrapper(p, 'num_threads', None))
-            st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None, cpu_check_interval))
+            if new_process:
+                # Blocking call for `cpu_check_interval` seconds
+                st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None, cpu_check_interval))
+            else:
+                # Non-blocking call
+                st['cpu'].append(self.psutil_wrapper(p, 'cpu_percent', None))
 
             st['open_fd'].append(self.psutil_wrapper(p, 'num_fds', None))
 

diff --git a/tests/checks/integration/test_process.py b/tests/checks/integration/test_process.py
@@ -29,6 +29,10 @@
     _PSUTIL_MEM_SHARED = False
 
 
+class MockProcess(object):
+    def is_running(self):
+        return True
+
 
 class ProcessCheckTest(AgentCheckTest):
     CHECK_NAME = 'process'
@@ -43,7 +47,7 @@ class ProcessCheckTest(AgentCheckTest):
                     'warning': [1, 5]
                 }
             },
-            'mocked_processes': 0
+            'mocked_processes': set()
         },
         {
             'config': {
@@ -54,7 +58,7 @@ class ProcessCheckTest(AgentCheckTest):
                     'warning': [2, 4]
                 }
             },
-            'mocked_processes': 1
+            'mocked_processes': set([1])
         },
         {
             'config': {
@@ -65,7 +69,7 @@ class ProcessCheckTest(AgentCheckTest):
                     'warning': [1, 4]
                 }
             },
-            'mocked_processes': 3
+            'mocked_processes': set([22, 35])
         },
         {
             'config': {
@@ -76,7 +80,7 @@ class ProcessCheckTest(AgentCheckTest):
                     'warning': [2, 5]
                 }
             },
-            'mocked_processes': 5
+            'mocked_processes': set([1, 5, 44, 901, 34])
         },
         {
             'config': {
@@ -87,15 +91,15 @@ class ProcessCheckTest(AgentCheckTest):
                     'warning': [2, 5]
                 }
             },
-            'mocked_processes': 6
+            'mocked_processes': set([3, 7, 2, 9, 34, 72])
         },
         {
             'config': {
                 'name': 'test_tags',
                 'search_string': ['test_5'],  # index in the array for our find_pids mock
                 'tags': ['onetag', 'env:prod']
             },
-            'mocked_processes': 1
+            'mocked_processes': set([2])
         },
         {
             'config': {
@@ -105,7 +109,7 @@ class ProcessCheckTest(AgentCheckTest):
                     'test': 'test'
                 }
             },
-            'mocked_processes': 1
+            'mocked_processes': set([89])
         },
     ]
 
@@ -204,12 +208,21 @@ def deny_cmdline(obj):
     def mock_find_pids(self, name, search_string, exact_match=True, ignore_ad=True,
                        refresh_ad_cache=True):
         idx = search_string[0].split('_')[1]
-        # Use a real PID to get real metrics!
-        return [os.getpid()] * self.CONFIG_STUBS[int(idx)]['mocked_processes']
+        return self.CONFIG_STUBS[int(idx)]['mocked_processes']
+
+    def mock_psutil_wrapper(self, process, method, accessors, *args, **kwargs):
+        if accessors is None:
+            result = 0
+        else:
+            result = dict([(accessor, 0) for accessor in accessors])
+
+        return result
 
-    def test_check(self):
+    @patch('psutil.Process', return_value=MockProcess())
+    def test_check(self, mock_process):
         mocks = {
-            'find_pids': self.mock_find_pids
+            'find_pids': self.mock_find_pids,
+            'psutil_wrapper': self.mock_psutil_wrapper,
         }
 
         config = {
@@ -230,17 +243,12 @@ def test_check(self):
                     expected_tags += stub['config']['tags']
 
                 expected_value = None
-                # cases where we don't actually expect some metrics
-                #  - if no processes are matched we don't send metrics except number
-                #  - if io_counters() is not available skip the metrics
-                #  - if memory_info_ex() is not available skip the metrics
-                if (mocked_processes == 0 and mname != 'system.processes.number')\
-                        or (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
-                        or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
+                # if no processes are matched we don't send metrics except number
+                if len(mocked_processes) == 0 and mname != 'system.processes.number':
                     continue
 
                 if mname == 'system.processes.number':
-                    expected_value = mocked_processes
+                    expected_value = len(mocked_processes)
 
                 self.assertMetric(
                     mname, count=1,
@@ -252,7 +260,7 @@ def test_check(self):
             expected_tags = ['process:{0}'.format(stub['config']['name'])]
             critical = stub['config'].get('thresholds', {}).get('critical')
             warning = stub['config'].get('thresholds', {}).get('warning')
-            procs = stub['mocked_processes']
+            procs = len(stub['mocked_processes'])
 
             if critical is not None and (procs < critical[0] or procs > critical[1]):
                 self.assertServiceCheckCritical('process.up', count=1, tags=expected_tags)
@@ -280,7 +288,9 @@ def test_check_real_process(self):
 
         expected_tags = ['py', 'process_name:py']
         for mname in self.PROCESS_METRIC:
-            # See same tests comment in the function `test_check`
+            # cases where we don't actually expect some metrics here:
+            #  - if io_counters() is not available
+            #  - if memory_info_ex() is not available
             if (not _PSUTIL_IO_COUNTERS and '.io' in mname)\
                     or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname):
                 continue