From 16d1ffad093a4562d29729d8916def05ff27124d Mon Sep 17 00:00:00 2001 From: Conor Branagan Date: Tue, 31 Dec 2013 18:45:48 +0000 Subject: [PATCH 1/3] Get bernard working with new style config. Instead of defining each check separately, you define the check once with multiple parameters in the config, such as port or password. * Tests are passing. * Removed "attempts" option because this logic is handled in Colvin. * Refactored code in multiple places. --- bernard.yaml.example | 46 ++++----- bernard/check.py | 225 +++++++++++++++++++++--------------------- bernard/cli.py | 10 +- bernard/core.py | 30 +++--- bernard/scheduler.py | 114 ++++++++------------- tests/test_bernard.py | 193 ++++++++++++++++++------------------ 6 files changed, 288 insertions(+), 330 deletions(-) diff --git a/bernard.yaml.example b/bernard.yaml.example index 27db67d58d..1332c72d29 100644 --- a/bernard.yaml.example +++ b/bernard.yaml.example @@ -1,28 +1,20 @@ ## Default configuration -# core: -# schedule: -# timeout: 5 # To check will timeout and exit after {timeout} seconds -# period: 60 # Scheduled once every {period} seconds -# attempts: 3 # The state change is confirmed only after {attempts} attempts (1 for instant change). 
-# notification: "" # String added in the event body -# notify_startup: none # Which state to notify at startup, can be all, warning, critical or none -# checks: - -## Advanced example -# core: -# schedule: -# timeout: 3 -# period: 90 -# notification: "@all" -# checks: -# - path: /path2/default_checks/ -# attempts: 2 -# - path: /path1/my_checks/ -# timeout: 2 -# period: 50 -# notification: "@my_email Take a look" -# notify_startup: warning -# args: ["--verbose"] -# - filename: /path3/checks/my_check -# args: ["-w", "10", "-w", "30"] -# name: custom name for this check +#core: +# schedule: +# timeout: 5 # The check will time out and exit after {timeout} seconds +# period: 15 # Scheduled once every {period} seconds +# +#checks: +# check_ntp: +# command: /usr/local/bin/check_ntp_peer -H $host +# +# check_pg: +# command: /usr/local/bin/check_pg.sh -p $port -d $db +# options: +# timeout: 2 +# period: 60 +# params: +# - port: 5432 +# db: db1 +# - port: 5433 +# db: db2 diff --git a/bernard/check.py b/bernard/check.py index 2178335082..b48f8cafe9 100644 --- a/bernard/check.py +++ b/bernard/check.py @@ -1,17 +1,19 @@ +# stdlib import logging import os import re import signal +import shlex import subprocess import time + +# project from util import ( + get_hostname, namedtuple, StaticWatchdog, ) -class InvalidCheckOutput(Exception): - pass - class Timeout(Exception): pass @@ -22,15 +24,15 @@ class InvalidPath(Exception): CheckResult = namedtuple('CheckResult', ['status', 'state', 'message', 'execution_date', 'execution_time']) -# Status of the execution of the check -ExecutionStatus = namedtuple('ExecutionStatus', +# State of the last execution of the check +ExecutionState = namedtuple('ExecutionState', ['OK', 'TIMEOUT', 'EXCEPTION', 'INVALID_OUTPUT']) -S = ExecutionStatus('ok', 'timeout', 'exception', 'invalid_output') +S = ExecutionState('ok', 'timeout', 'exception', 'invalid_output') -# State of check -ResultState = namedtuple('ResultState', - ['NONE', 'OK', 'WARNING', 
'CRITICAL', 'UNKNOWN']) -R = ResultState('init', 'ok', 'warning', 'critical', 'unknown') +# Check result status +class R(): + OK, WARNING, CRITICAL, UNKNOWN, NONE = (0, 1, 2, 3, 4) + ALL = (OK, WARNING, CRITICAL, UNKNOWN, NONE) log = logging.getLogger(__name__) @@ -43,92 +45,64 @@ class BernardCheck(object): ])) @classmethod - def from_config(cls, check_config, defaults): - check_paths = [] - path = check_config.get('path', '') - filename = check_config.get('filename', '') - notification = check_config.get('notification', '') - timeout = int(check_config.get('timeout', 0)) - period = int(check_config.get('period', 0)) - attempts = int(check_config.get('attempts', 0)) - name = check_config.get('name', None) - args = check_config.get('args', []) - notify_startup = check_config.get('notify_startup', None) - if path: - try: - filenames = os.listdir(path) - check_paths = [] - for fname in filenames: - # Filter hidden files - if not fname.startswith('.'): - check_path = os.path.join(path, fname) - # Keep only executable files - if os.path.isfile(check_path) and os.access(check_path, os.X_OK): - check_paths.append(check_path) - except OSError, e: - raise InvalidPath(str(e)) - if filename: - check_paths.append(filename) - + def from_config(cls, name, check_config, defaults, hostname=None): + options = check_config.get('options', {}) + timeout = int(options.get('timeout', 0)) + period = int(options.get('period', 0)) + raw_command = check_config.get('command') + params_list = check_config.get('params') or [{}] + hostname = hostname or get_hostname() + + check_config = { + 'timeout': timeout or defaults['timeout'], + 'period': period or defaults['period'], + } checks = [] - if check_paths: - check_parameter = defaults.copy() - if notification: - check_parameter['notification'] = notification - if timeout: - check_parameter['timeout'] = timeout - if period: - check_parameter['period'] = period - if attempts: - check_parameter['attempts'] = attempts - if notify_startup: - 
check_parameter['notify_startup'] = notify_startup - if name: - check_parameter['name'] = name - for check_path in check_paths: - checks.append(cls(check=check_path, config=check_parameter, - args=args)) + + # For every set of params (e.g.: {'port': 8888}) return a single check. + # We'll template the $variables in the `command` value with the params. + for param_dict in params_list: + command = _subprocess_command(raw_command, param_dict, hostname) + checks.append(cls(name, command, check_config)) + return checks - def __init__(self, check, config, args=[]): - self.check = check + def __init__(self, name, command, config): + """ Initializes a BernardCheck with the given `name` and `command`. + Any additional config (e.g. timeout or period) are given in the + `config` dict. `command` is expected to be in a subprocess-friendly + form, e.g.: ['check_foo', '-h', 'localhost']. + """ + self.name = name self.config = config - self.args = args - self.command = [self.check] + args - + self.command = command self.run_count = 0 self.event_count = 0 - self.container_size = self.config['attempts'] + 1 - - # Contains the result of #{container_size} last checks - self.result_container = [] + # Always holds the latest result. 
+ self.result = None - # Set check_name, remove file extension and "check_" prefix - if 'name' in config: - check_name = config['name'] - else: - check_name = self.check.split('/')[-1] - if check_name.startswith('check_'): - check_name = check_name[6:] - check_name = check_name.rsplit('.')[0] - - self.check_name = check_name.lower() - log.debug(u"Initialized check %s (%s)" % (self.check_name, ' '.join(self.command))) + log.debug(u"Initialized check %s (%s)" % (self.name, command)) def __repr__(self): - return self.check_name + return self.name + + def get_period(self): + return self.config['period'] def _execute_check(self): timeout = self.config.get('timeout') output = None returncode = None + # This is going to disable the StaticWatchdog signal.signal(signal.SIGALRM, self.timeout_handler) signal.alarm(timeout) try: try: - process = subprocess.Popen(self.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen(self.command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) output = process.communicate()[0].strip() returncode = process.returncode if len(output) > 20: @@ -136,7 +110,7 @@ def _execute_check(self): else: truncated_output = output log.info(u"Check[%s]: %s => %s (%s)" % ( - self.check_name, + self.name, u' '.join(self.command), returncode, truncated_output @@ -153,50 +127,53 @@ def _execute_check(self): def timeout_handler(self, signum, frame): raise Timeout() - def run(self): + def run(self, dogstatsd_client): execution_date = time.time() try: output, returncode = self._execute_check() + if output is None: - status = S.TIMEOUT - state = R.UNKNOWN + state = S.TIMEOUT + status = R.UNKNOWN message = 'Check %s timed out after %ds' % (self, self.config['timeout']) else: - try: - state, message = self.parse_nagios(output, returncode) - status = S.OK - except InvalidCheckOutput: - status = S.INVALID_OUTPUT - state = R.UNKNOWN - message = u'Failed to parse the output of the check: %s, returncode: %d, output: %s' % ( - self, 
returncode, output) + if returncode not in R.ALL: + state = S.INVALID_OUTPUT + status = R.UNKNOWN + message = u'Failed to parse the output of the check: %s, ' \ + 'returncode: %d, output: %s' \ + % (self, returncode, output) log.warn(message) - except OSError, exception: - state = R.UNKNOWN - status = S.EXCEPTION + else: + message = self.parse_nagios(output, dogstatsd_client) + state = S.OK + status = returncode + except OSError: + status = R.UNKNOWN + state = S.EXCEPTION message = u'Failed to execute the check: %s' % self log.warn(message, exc_info=True) execution_time = time.time() - execution_date self.run_count += 1 - return CheckResult( + check_result = CheckResult( status=status, state=state, message=message, execution_date=execution_date, execution_time=execution_time ) + self.result = check_result + return check_result - def parse_nagios(self, output, returncode): - state = returncode - + def parse_nagios(self, output, dogstatsd_client): output = output.strip() try: message, tail = output.split('|', 1) except ValueError: # No metric, return directly the output as a message - return state, output + return output message = message.strip() @@ -217,7 +194,7 @@ def parse_nagios(self, output, returncode): unit = metric.group('unit') dd_metric = self._metric_name(label) - # self.dogstatsd.increment('bernard.check.metric_points') + dogstatsd_client.increment('bernard.check.metric_points') if unit == '%': value = value / 100.0 @@ -234,38 +211,58 @@ def parse_nagios(self, output, returncode): elif unit == 'us': value = value / 1000000.0 elif unit == 'c': - # self.dogstatsd.rate(dd_metric, value) + dogstatsd_client.rate(dd_metric, value) log.debug('Saved rate: %s:%.2f' % (dd_metric, value)) continue - # self.dogstatsd.gauge(dd_metric, value) + dogstatsd_client.gauge(dd_metric, value) log.debug('Saved metric: %s:%.2f' % (dd_metric, value)) - return state, message + return message def _metric_name(self, label): - return 'bernard.%s.%s' % (self.check_name, label) - - def 
get_last_result(self): - return self.get_result(0) + return 'bernard.%s.%s' % (self.name, label) - def get_result(self, position=0): - if len(self.result_container) > position: - index = - (position + 1) - return self.result_container[index] - elif position > self.container_size: - raise Exception('Trying to get %dth result while container size is %d' % (position, self.container_size)) - else: - return CheckResult(execution_date=0, status=S.OK, state=R.NONE, message='Not runned yet', execution_time=0) + def get_result(self): + if self.result: + return self.result + return CheckResult(execution_date=0, state=S.OK, status=R.NONE, + message='Not yet run.', execution_time=0) def get_status(self): - result = self.get_last_result() + result = self.get_result() return { - 'check_name': self.check_name, + 'check_name': self.name, 'run_count': self.run_count, 'status': result.status, 'state': result.state, 'message': result.message, 'execution_time': result.execution_time, } + + +def _subprocess_command(raw_command, params, hostname): + """ Given a raw command from the Bernard config and a dictionary of check + parameters, return a list that's subprocess-compatible for running the + command. We'll replace all command "variables" with a real parameter. + + >>> _subprocess_command("/usr/bin/check_pg -p $port", {'port': '5433'}, 'localhost') + ['/usr/bin/check_pg', '-p', '5433'] + """ + # $host is always available as a parameter. + if 'host' not in params: + params['host'] = hostname + + # Replace variables. + for param, val in params.iteritems(): + raw_command = raw_command.replace('$%s' % param, str(val)) + + # Split into subprocess format. 
+ command_split = raw_command.split() + if len(command_split) == 0: + raise Exception('Invalid command in config: %v' % raw_command) + parsed_command = [command_split[0]] + if len(command_split[1:]): + parsed_command.extend(shlex.split(' '.join(command_split[1:]))) + return parsed_command diff --git a/bernard/cli.py b/bernard/cli.py index 577a4a07bf..d46451f05c 100644 --- a/bernard/cli.py +++ b/bernard/cli.py @@ -1,20 +1,22 @@ +# stdlib import logging import sys + +# project +from bernard.core import Bernard from config import initialize_logging, get_config, get_parsed_args from daemon import AgentSupervisor - from util import ( PidFile, get_hostname, ) -from bernard.core import Bernard log = logging.getLogger(__name__) def main(): """" Execution of Bernard""" - # Check we're not using an old version of Python. We need 2.4 above because some modules (like subprocess) - # were only introduced in 2.4. + # Check we're not using an old version of Python. We need 2.4 above because + # some modules (like subprocess) were only introduced in 2.4. 
if int(sys.version_info[1]) <= 3: sys.stderr.write("Datadog agent requires python 2.4 or later.\n") sys.exit(2) diff --git a/bernard/core.py b/bernard/core.py index 9ef48cd51a..9b4f29f16d 100644 --- a/bernard/core.py +++ b/bernard/core.py @@ -1,19 +1,22 @@ +# stdlib import logging import signal import sys import time + +# project +from bernard.check import R, S +from bernard.scheduler import Scheduler +from checks.check_status import AgentStatus, style +from config import get_config_path from daemon import Daemon +from dogstatsd_client import DogStatsd from util import ( StaticWatchdog, get_os, yaml, yLoader, ) -from checks.check_status import AgentStatus, style -from config import get_config_path - -from bernard.check import R, S -from bernard.scheduler import Scheduler RESTART_INTERVAL = 4 * 24 * 60 * 60 # Defaults to 4 days BERNARD_CONF = "bernard.yaml" @@ -61,7 +64,9 @@ def run(self): # load Bernard config and checks bernard_config = get_bernard_config() - self.scheduler = Scheduler.from_config(self.hostname, bernard_config) + dogstatsd_client = DogStatsd() + self.scheduler = Scheduler.from_config(self.hostname, bernard_config, + dogstatsd_client) # Save the agent start-up stats. 
BernardStatus(checks=self.scheduler.checks).persist() @@ -117,13 +122,14 @@ class BernardStatus(AgentStatus): NAME = 'Bernard' - def __init__(self, checks=[], schedule_count=0): + def __init__(self, checks=None, schedule_count=0): AgentStatus.__init__(self) + checks = checks or [] self.check_stats = [check.get_status() for check in checks] self.schedule_count = schedule_count - self.STATUS_COLOR = {S.OK: 'green', S.TIMEOUT: 'yellow', S.EXCEPTION: 'red', S.INVALID_OUTPUT: 'red'} - self.STATE_COLOR = {R.OK: 'green', R.WARNING: 'yellow', R.CRITICAL: 'red', R.UNKNOWN: 'yellow', R.NONE: 'white'} + self.STATUS_COLOR = {R.OK: 'green', R.WARNING: 'yellow', R.CRITICAL: 'red', R.UNKNOWN: 'yellow', R.NONE: 'white'} + self.STATE_COLOR = {S.OK: 'green', S.TIMEOUT: 'yellow', S.EXCEPTION: 'red', S.INVALID_OUTPUT: 'red'} def body_lines(self): lines = [ @@ -141,8 +147,8 @@ def body_lines(self): for check in self.check_stats: status_color = self.STATUS_COLOR[check['status']] state_color = self.STATE_COLOR[check['state']] - lines += [' %s: [%s] #%d run is %s' % (check['check_name'], style(check['status'], status_color), - check['run_count'], style(check['state'], state_color))] + lines += [' %s: [%s] #%d run is %s' % (check['check_name'], style(check['state'], status_color), + check['run_count'], style(check['status'], state_color))] lines += [' %s' % ((check['message'] or ' ').splitlines()[0])] return lines @@ -162,8 +168,6 @@ def to_dict(self): def get_bernard_config(): """Return the configuration of Bernard""" - - osname = get_os() config_path = get_config_path(os_name=get_os(), filename=BERNARD_CONF) try: diff --git a/bernard/scheduler.py b/bernard/scheduler.py index 79ff0971d5..68ca14a9e9 100644 --- a/bernard/scheduler.py +++ b/bernard/scheduler.py @@ -1,58 +1,57 @@ +# stdlib import logging import random import time -import kima.client -from bernard.check import BernardCheck, R, S +# project +import kima.client +from bernard.check import BernardCheck, S log = 
logging.getLogger(__name__) +# FIXME: Overriding the config for Kima. +API_KEY = 'apikey_2' +BASE_URL = 'http://localhost:5000' + class Scheduler(object): - """ - Schedule Bernard checks execution. - """ + """ Schedule Bernard checks execution. """ # Ratio of jitter to introduce in the scheduling JITTER_FACTOR = 0.1 + # Check config defaults + DEFAULT_TIMEOUT = 5 + DEFAULT_PERIOD = 15 + @classmethod - def from_config(cls, hostname, bernard_config): + def from_config(cls, hostname, bernard_config, dogstatsd_client): schedule_config = bernard_config.get('core', {}).get('schedule', {}) bernard_checks = [] - DEFAULT_TIMEOUT = 5 - DEFAULT_FREQUENCY = 60 - DEFAULT_ATTEMPTS = 3 - - default_check_parameter = { - 'hostname': hostname, - 'timeout': int(schedule_config.get('timeout', DEFAULT_TIMEOUT)), - 'frequency': int(schedule_config.get('period', DEFAULT_FREQUENCY)), - 'attempts': int(schedule_config.get('period', DEFAULT_ATTEMPTS)), - 'notification': bernard_config.get('core', {}).get('notification', None), - 'notify_startup': bernard_config.get('core', {}).get('notify_startup', "none"), + default_options = { + 'timeout': int(schedule_config.get('timeout', cls.DEFAULT_TIMEOUT)), + 'period': int(schedule_config.get('period', cls.DEFAULT_PERIOD)), } - try: - check_configs = bernard_config.get('checks') or [] - for check_config in check_configs: - try: - bernard_checks.extend(BernardCheck.from_config(check_config, default_check_parameter)) - except Exception, e: - log.exception(e) - - except AttributeError: - log.info("Error while parsing Bernard configuration file. 
Be sure the structure is valid.") - return [] + check_configs = bernard_config.get('checks') or {} + for check_name, check_config in check_configs.iteritems(): + try: + check = BernardCheck.from_config(check_name, check_config, + default_options, hostname) + except Exception: + log.exception('Unable to load check %s' % check_name) + else: + bernard_checks.extend(check) return cls(checks=bernard_checks, config=bernard_config, - hostname=hostname) + hostname=hostname, dogstatsd_client=dogstatsd_client) - def __init__(self, checks, config, hostname): - """Initialize scheduler""" + def __init__(self, checks, config, hostname, dogstatsd_client): + """ Initialize scheduler """ self.checks = checks self.config = config self.hostname = hostname + self.dogstatsd_client = dogstatsd_client self.schedule_count = 0 # Initialize schedule @@ -62,11 +61,9 @@ def __init__(self, checks, config, hostname): for check in self.checks: self.schedule.append((position, check)) position += 1 - check.last_notified_state = R.NONE - api_key = 'apikey_2' - base_url = 'http://localhost:9000' - self.kima = kima.client.connect(api_key, base_url) + # Initialize our kima client. 
+ self.kima = kima.client.connect(API_KEY, BASE_URL) # Scheduler doesn't need to be initialize if no check assert self.checks @@ -93,7 +90,7 @@ def _now(self): def process(self): """ Execute the next scheduled check """ check = self._pop_check() - result = check.run() + result = check.run(self.dogstatsd_client) self.schedule_count += 1 # post results @@ -107,13 +104,13 @@ def process(self): def reschedule_check(self, check): # Get the duration to wait for the next scheduling - waiting = check.config['frequency'] - status = check.get_last_result().status - if status == S.TIMEOUT: + waiting = check.get_period() + state = check.get_result().state + if state == S.TIMEOUT: waiting = waiting * 3 - elif status == S.INVALID_OUTPUT: + elif state == S.INVALID_OUTPUT: waiting = waiting * 8 - elif status == S.EXCEPTION: + elif state == S.EXCEPTION: waiting = waiting * 12 jitter_range = self.JITTER_FACTOR * waiting @@ -134,39 +131,8 @@ def reschedule_check(self, check): log.debug('%s is rescheduled, next run in %.2fs' % (check, waiting)) def post_run(self, result): - return self.kima.post_monitor_run( - # Impedence mismatch: bernard calls states what the server - # knows as statuses. Should reconcile. 
- status=result.state, + return self.kima.post_check_run( + status=result.status, output=result.message, timestamp=result.execution_date, host_name=self.hostname) - -class SimulatedScheduler(Scheduler): - def __init__(self, *args, **kwargs): - Scheduler.__init__(self, *args, **kwargs) - self.virtual_time = time.time() - - def wait_time(self): - return 0 - - def _now(self): - """ - Set the virtual time at the end of the check execution - Reschedule the check on a timestamp based on this virtual time - """ - last_result = check.get_last_result() - timestamp = self.virtual_time + last_result.execution_time - self.virtual_time = timestamp - return timestamp - - def _pop_check(self): - """ - When going to run a next check in simulated time, move the - simulated time to the next scheduled timestamp if - it is in the future - """ - self.virtual_time = max(self.schedule[0][0], self.virtual_time) - if self.schedule: - return self.schedule.pop(0)[1] - diff --git a/tests/test_bernard.py b/tests/test_bernard.py index 3c40215fa3..be5b5e6837 100644 --- a/tests/test_bernard.py +++ b/tests/test_bernard.py @@ -3,9 +3,45 @@ import os import time logger = logging.getLogger() -from scheduler import Scheduler, Notifier -from checks.bernard_check import BernardCheck, R, S +from bernard.scheduler import Scheduler +from bernard.check import BernardCheck, R, S from dogstatsd_client import DogStatsd +from util import get_hostname + +class FakeDogstatsd(DogStatsd): + """Fake DogStatsd client, which keeps requests to test them""" + + def __init__(self, host='localhost', port=8125): + self.host = host + self.port = port + self.metrics = [] + self.events = [] + + def _send(self, metric, metric_type, value, tags, sample_rate): + self.metrics.append({ + 'metric': metric, + 'metric_type': metric_type, + 'value': value, + 'tags': tags, + 'sample_rate': sample_rate, + }) + + def event(self, title, text, alert_type=None, aggregation_key=None, source_type_name=None, date_happened=None, priority=None, 
tags=None, hostname=None): + self.events.append({ + 'title': title, + 'text': text, + 'alert_type': alert_type, + 'aggregation_key': aggregation_key, + 'source_type_name': source_type_name, + 'date_happened': date_happened, + 'priority': priority, + 'tags': tags, + 'hostname': hostname, + }) + + def flush(self): + self.metrics = [] + self.events = [] class TestBernardCheck(unittest.TestCase): "Tests to validate the Bernard check logic" @@ -14,15 +50,16 @@ class TestBernardCheck(unittest.TestCase): def test_timeout_check(self): """Specific tests for timeout checks to make tests faster""" check_timeout = self._get_timeout_check() + dogstatsd = FakeDogstatsd() start = time.time() - check_timeout.run() + check_timeout.run(dogstatsd) end = time.time() # Check status and state - result = check_timeout.get_last_result() - self.assertEqual(result.state, R.UNKNOWN) - self.assertEqual(result.status, S.TIMEOUT) + result = check_timeout.get_result() + self.assertEqual(result.status, R.UNKNOWN) + self.assertEqual(result.state, S.TIMEOUT) # Check execution duration. Timeout is 1s, let's give it 1.2s to run. 
self.assertTrue(result.execution_time < 1.2) @@ -31,48 +68,50 @@ def test_timeout_check(self): def test_result(self): """Test result of each checks: if status and states are okay""" check_ok, check_warning, check_wrong_exit, check_disappeared = self._get_test_checks() + dogstatsd_client = FakeDogstatsd() - check_ok.run() - check_warning.run() - check_wrong_exit.run() - check_disappeared.run() + check_ok.run(dogstatsd_client) + check_warning.run(dogstatsd_client) + check_wrong_exit.run(dogstatsd_client) + check_disappeared.run(dogstatsd_client) - result = check_ok.get_last_result() - self.assertEqual(result.state, R.OK) - self.assertEqual(result.status, S.OK) + result = check_ok.get_result() + self.assertEqual(result.status, R.OK) + self.assertEqual(result.state, S.OK) - result = check_warning.get_last_result() - self.assertEqual(result.state, R.WARNING) - self.assertEqual(result.status, S.OK) + result = check_warning.get_result() + self.assertEqual(result.status, R.WARNING) + self.assertEqual(result.state, S.OK) - result = check_wrong_exit.get_last_result() - self.assertEqual(result.state, R.UNKNOWN) - self.assertEqual(result.status, S.INVALID_OUTPUT) + result = check_wrong_exit.get_result() + self.assertEqual(result.status, R.UNKNOWN) + self.assertEqual(result.state, S.INVALID_OUTPUT) - result = check_disappeared.get_last_result() - self.assertEqual(result.state, R.UNKNOWN) - self.assertEqual(result.status, S.EXCEPTION) + result = check_disappeared.get_result() + self.assertEqual(result.status, R.UNKNOWN) + self.assertEqual(result.state, S.EXCEPTION) def test_perfdata_metrics(self): """Test perfdata metrics: if the parsing, scaling and dogstatsd calls are okay""" check_ok, check_warning, check_wrong_exit, check_disappeared = self._get_test_checks() + dogstatsd_client = FakeDogstatsd() metric = {} - check_ok.run() - for m in check_ok.dogstatsd.metrics: - if m['metric'] == 'bernard.ok.metric1': + check_ok.run(dogstatsd_client) + for m in dogstatsd_client.metrics: + 
if m['metric'] == 'bernard.check_ok.metric1': metric = m self.assertEqual(metric.get('value'), 30) self.assertEqual(metric.get('metric_type'), 'g') - check_warning.run() - for m in check_warning.dogstatsd.metrics: - if m['metric'] == 'bernard.warning.timing': + check_warning.run(dogstatsd_client) + for m in dogstatsd_client.metrics: + if m['metric'] == 'bernard.check_warning.timing': metric = m self.assertEqual(metric.get('value'), 0.001) self.assertEqual(metric.get('metric_type'), 'g') - for m in check_warning.dogstatsd.metrics: - if m['metric'] == 'bernard.warning.count': + for m in dogstatsd_client.metrics: + if m['metric'] == 'bernard.check_warning.count': metric = m self.assertEqual(metric.get('value'), 1234) self.assertEqual(metric.get('metric_type'), '_dd-r') @@ -87,10 +126,10 @@ def test_scheduler(self): scheduler.JITTER_FACTOR = 0 # Be sure it keeps the initial order - self.assertEqual(scheduler.schedule[0][1].check_name, check_ok.check_name) - self.assertEqual(scheduler.schedule[1][1].check_name, check_warning.check_name) - self.assertEqual(scheduler.schedule[2][1].check_name, check_wrong_exit.check_name) - self.assertEqual(scheduler.schedule[3][1].check_name, check_disappeared.check_name) + self.assertEqual(scheduler.schedule[0][1].name, check_ok.name) + self.assertEqual(scheduler.schedule[1][1].name, check_warning.name) + self.assertEqual(scheduler.schedule[2][1].name, check_wrong_exit.name) + self.assertEqual(scheduler.schedule[3][1].name, check_disappeared.name) # Should run each check once scheduler.process() @@ -99,85 +138,43 @@ def test_scheduler(self): scheduler.process() # Look at the new schedule - self.assertEqual(scheduler.schedule[0][1].check_name, check_ok.check_name) - self.assertEqual(scheduler.schedule[1][1].check_name, check_warning.check_name) - self.assertEqual(scheduler.schedule[2][1].check_name, check_wrong_exit.check_name) - self.assertEqual(scheduler.schedule[3][1].check_name, check_disappeared.check_name) + 
self.assertEqual(scheduler.schedule[0][1].name, check_ok.name) + self.assertEqual(scheduler.schedule[1][1].name, check_warning.name) + self.assertEqual(scheduler.schedule[2][1].name, check_wrong_exit.name) + self.assertEqual(scheduler.schedule[3][1].name, check_disappeared.name) # Be sure that schedule order corresponds to timestamps self.assertTrue(scheduler.schedule[1][0] <= scheduler.schedule[1][0]) self.assertTrue(scheduler.schedule[1][0] <= scheduler.schedule[2][0]) self.assertTrue(scheduler.schedule[2][0] <= scheduler.schedule[3][0]) - - def _get_check_parameters(self): - dogstatsd = FakeDogstatsd() - path = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(path, 'bernard_checks') - - config = { - 'frequency': 60, - 'attempts': 3, - 'timeout': 1, - 'notification': '', - 'notify_startup': 'none', - } - - return path, config, dogstatsd - def _get_test_checks(self): - path, config, dogstatsd = self._get_check_parameters() - return [ - BernardCheck(os.path.join(path, 'check_ok'), config, dogstatsd), - BernardCheck(os.path.join(path, 'check_warning'), config, dogstatsd), - BernardCheck(os.path.join(path, 'check_wrong_exit'), config, dogstatsd), - BernardCheck(os.path.join(path, 'check_disappeared'), config, dogstatsd), + BernardCheck.from_config('check_ok', self._get_check_config('check_ok'), {})[0], + BernardCheck.from_config('check_warning', self._get_check_config('check_warning'), {})[0], + BernardCheck.from_config('check_wrong_exit', self._get_check_config('check_wrong_exit'), {})[0], + BernardCheck.from_config('check_disappeared', self._get_check_config('check_disappeared'), {})[0], ] def _get_timeout_check(self): - path, config, dogstatsd = self._get_check_parameters() - - return BernardCheck(os.path.join(path, 'check_timeout'), config, dogstatsd) + return BernardCheck.from_config('check_timeout', self._get_check_config('check_timeout'), {})[0] def _get_scheduler(self, checks): - return Scheduler(checks=checks, config={}) + return 
Scheduler(checks, {}, get_hostname(), FakeDogstatsd()) + + def _get_check_config(self, command): + path = os.path.dirname(os.path.abspath(__file__)) + path = os.path.join(path, 'bernard_checks') + return { + 'command': os.path.join(path, command), + 'options': { + 'period': 60, + 'attempts': 3, + 'timeout': 1, + } + } if __name__ == '__main__': unittest.main() -class FakeDogstatsd(DogStatsd): - """Fake DogStatsd client, which keeps requests to test them""" - - def __init__(self, host='localhost', port=8125): - self.host = host - self.port = port - self.metrics = [] - self.events = [] - - def _send(self, metric, metric_type, value, tags, sample_rate): - self.metrics.append({ - 'metric': metric, - 'metric_type': metric_type, - 'value': value, - 'tags': tags, - 'sample_rate': sample_rate, - }) - - def event(self, title, text, alert_type=None, aggregation_key=None, source_type_name=None, date_happened=None, priority=None, tags=None, hostname=None): - self.events.append({ - 'title': title, - 'text': text, - 'alert_type': alert_type, - 'aggregation_key': aggregation_key, - 'source_type_name': source_type_name, - 'date_happened': date_happened, - 'priority': priority, - 'tags': tags, - 'hostname': hostname, - }) - - def flush(self): - self.metrics = [] - self.events = [] \ No newline at end of file From c3c9163a1c552e913cb4e64bb3d313110d2397f8 Mon Sep 17 00:00:00 2001 From: Conor Branagan Date: Tue, 31 Dec 2013 18:47:58 +0000 Subject: [PATCH 2/3] Fix pylint issue. 
--- bernard/core.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bernard/core.py b/bernard/core.py index 9b4f29f16d..79cff94875 100644 --- a/bernard/core.py +++ b/bernard/core.py @@ -9,7 +9,7 @@ from bernard.scheduler import Scheduler from checks.check_status import AgentStatus, style from config import get_config_path -from daemon import Daemon +from daemon import Daemon, AgentSupervisor from dogstatsd_client import DogStatsd from util import ( StaticWatchdog, @@ -104,8 +104,7 @@ def run(self): # Now clean-up. BernardStatus.remove_latest_status() - # Explicitly kill the process, because it might be running - # as a daemon. + # Explicitly kill the process, because it might be running as a daemon. log.info("Exiting. Bye bye.") sys.exit(0) From 963c76a5d26e825541b3af2b410a25e5ad3d0002 Mon Sep 17 00:00:00 2001 From: Conor Branagan Date: Tue, 31 Dec 2013 20:16:46 +0000 Subject: [PATCH 3/3] Pass the params and check name to the posted check run. --- bernard/check.py | 14 ++++++++++---- bernard/scheduler.py | 6 ++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bernard/check.py b/bernard/check.py index b48f8cafe9..628bfcbfbd 100644 --- a/bernard/check.py +++ b/bernard/check.py @@ -62,12 +62,17 @@ def from_config(cls, name, check_config, defaults, hostname=None): # For every set of params (e.g.: {'port': 8888}) return a single check. # We'll template the $variables in the `command` value with the params. for param_dict in params_list: + # Stringify all of the check params. We expect everything to be + # strings through the pipeline so we'll do it early on. 
+ for k, v in param_dict.iteritems(): + param_dict[k] = str(v) + command = _subprocess_command(raw_command, param_dict, hostname) - checks.append(cls(name, command, check_config)) + checks.append(cls(name, command, check_config, param_dict)) return checks - def __init__(self, name, command, config): + def __init__(self, name, command, config, params): """ Initializes a BernardCheck with the given `name` and `command`. Any additional config (e.g. timeout or period) are given in the `config` dict. `command` is expected to be in a subprocess-friendly @@ -76,6 +81,7 @@ def __init__(self, name, command, config): self.name = name self.config = config self.command = command + self.params = params self.run_count = 0 self.event_count = 0 @@ -162,7 +168,7 @@ def run(self, dogstatsd_client): state=state, message=message, execution_date=execution_date, - execution_time=execution_time + execution_time=execution_time, ) self.result = check_result return check_result @@ -256,7 +262,7 @@ def _subprocess_command(raw_command, params, hostname): # Replace variables. for param, val in params.iteritems(): - raw_command = raw_command.replace('$%s' % param, str(val)) + raw_command = raw_command.replace('$%s' % param, val) # Split into subprocess format. command_split = raw_command.split() diff --git a/bernard/scheduler.py b/bernard/scheduler.py index 68ca14a9e9..584216c264 100644 --- a/bernard/scheduler.py +++ b/bernard/scheduler.py @@ -95,7 +95,7 @@ def process(self): # post results try: - self.post_run(result) + self.post_run(check, result) except Exception: log.error("Could not post run", exc_info=True) @@ -130,9 +130,11 @@ def reschedule_check(self, check): log.debug('%s is rescheduled, next run in %.2fs' % (check, waiting)) - def post_run(self, result): + def post_run(self, check, result): return self.kima.post_check_run( + check=check.name, status=result.status, output=result.message, timestamp=result.execution_date, + params=check.params, host_name=self.hostname)