From 16d1ffad093a4562d29729d8916def05ff27124d Mon Sep 17 00:00:00 2001
From: Conor Branagan <conor.branagan@gmail.com>
Date: Tue, 31 Dec 2013 18:45:48 +0000
Subject: [PATCH 1/3] Get bernard working with new style config.

Instead of defining each check separately, you define the check once
with multiple parameters in the config, such as port or password.

* Tests are passing.
* Removed "attempts" option because this logic is handled in Colvin.
* Refactored code in multiple places.
---
 bernard.yaml.example  |  46 ++++-----
 bernard/check.py      | 225 +++++++++++++++++++++---------------------
 bernard/cli.py        |  10 +-
 bernard/core.py       |  30 +++---
 bernard/scheduler.py  | 114 ++++++++-------------
 tests/test_bernard.py | 193 ++++++++++++++++++------------------
 6 files changed, 288 insertions(+), 330 deletions(-)

diff --git a/bernard.yaml.example b/bernard.yaml.example
index 27db67d58d..1332c72d29 100644
--- a/bernard.yaml.example
+++ b/bernard.yaml.example
@@ -1,28 +1,20 @@
 ## Default configuration
-# core:
-#   schedule:
-#     timeout:  5           # To check will timeout and exit after {timeout} seconds
-#     period:  60           # Scheduled once every {period} seconds
-#     attempts: 3           # The state change is confirmed only after {attempts} attempts (1 for instant change).
-#   notification: ""        # String added in the event body
-#   notify_startup: none    # Which state to notify at startup, can be all, warning, critical or none
-# checks:
-
-## Advanced example
-# core:
-#   schedule:
-#     timeout: 3
-#     period: 90
-#   notification: "@all"
-# checks:
-#   - path: /path2/default_checks/
-#     attempts: 2
-#   - path: /path1/my_checks/
-#     timeout: 2
-#     period: 50
-#     notification: "@my_email Take a look"
-#     notify_startup: warning
-#     args: ["--verbose"]
-#   - filename: /path3/checks/my_check
-#     args: ["-w", "10", "-w", "30"]
-#     name: custom name for this check
+#core:
+#    schedule:
+#        timeout:  5           # The check will time out and exit after {timeout} seconds
+#        period:  15           # Scheduled once every {period} seconds
+#
+#checks:
+#    check_ntp:
+#        command: /usr/local/bin/check_ntp_peer -H $host
+#
+#    check_pg:
+#        command: /usr/local/bin/check_pg.sh -p $port -d $db
+#        options:
+#            timeout: 2
+#            period: 60
+#        params:
+#            - port: 5432
+#              db: db1
+#            - port: 5433
+#              db: db2
diff --git a/bernard/check.py b/bernard/check.py
index 2178335082..b48f8cafe9 100644
--- a/bernard/check.py
+++ b/bernard/check.py
@@ -1,17 +1,19 @@
+# stdlib
 import logging
 import os
 import re
 import signal
+import shlex
 import subprocess
 import time
+
+# project
 from util import (
+    get_hostname,
     namedtuple,
     StaticWatchdog,
 )
 
-class InvalidCheckOutput(Exception):
-    pass
-
 class Timeout(Exception):
     pass
 
@@ -22,15 +24,15 @@ class InvalidPath(Exception):
 CheckResult = namedtuple('CheckResult',
     ['status', 'state', 'message', 'execution_date', 'execution_time'])
 
-# Status of the execution of the check
-ExecutionStatus = namedtuple('ExecutionStatus',
+# State of the last execution of the check
+ExecutionState = namedtuple('ExecutionState',
     ['OK', 'TIMEOUT', 'EXCEPTION', 'INVALID_OUTPUT'])
-S = ExecutionStatus('ok', 'timeout', 'exception', 'invalid_output')
+S = ExecutionState('ok', 'timeout', 'exception', 'invalid_output')
 
-# State of check
-ResultState = namedtuple('ResultState',
-    ['NONE', 'OK', 'WARNING', 'CRITICAL', 'UNKNOWN'])
-R = ResultState('init', 'ok', 'warning', 'critical', 'unknown')
+# Check result status
+class R():
+    OK, WARNING, CRITICAL, UNKNOWN, NONE = (0, 1, 2, 3, 4)
+    ALL = (OK, WARNING, CRITICAL, UNKNOWN, NONE)
 
 log = logging.getLogger(__name__)
 
@@ -43,92 +45,64 @@ class BernardCheck(object):
         ]))
 
     @classmethod
-    def from_config(cls, check_config, defaults):
-        check_paths = []
-        path = check_config.get('path', '')
-        filename = check_config.get('filename', '')
-        notification = check_config.get('notification', '')
-        timeout = int(check_config.get('timeout', 0))
-        period = int(check_config.get('period', 0))
-        attempts = int(check_config.get('attempts', 0))
-        name = check_config.get('name', None)
-        args = check_config.get('args', [])
-        notify_startup = check_config.get('notify_startup', None)
-        if path:
-            try:
-                filenames = os.listdir(path)
-                check_paths = []
-                for fname in filenames:
-                    # Filter hidden files
-                    if not fname.startswith('.'):
-                        check_path = os.path.join(path, fname)
-                        # Keep only executable files
-                        if os.path.isfile(check_path) and os.access(check_path, os.X_OK):
-                            check_paths.append(check_path)
-            except OSError, e:
-                raise InvalidPath(str(e))
-        if filename:
-            check_paths.append(filename)
-
+    def from_config(cls, name, check_config, defaults, hostname=None):
+        options = check_config.get('options', {})
+        timeout = int(options.get('timeout', 0))
+        period = int(options.get('period', 0))
+        raw_command = check_config.get('command')
+        params_list = check_config.get('params') or [{}]
+        hostname = hostname or get_hostname()
+
+        check_config = {
+            'timeout': timeout or defaults['timeout'],
+            'period': period or defaults['period'],
+        }
         checks = []
-        if check_paths:
-            check_parameter = defaults.copy()
-            if notification:
-                check_parameter['notification'] = notification
-            if timeout:
-                check_parameter['timeout'] = timeout
-            if period:
-                check_parameter['period'] = period
-            if attempts:
-                check_parameter['attempts'] = attempts
-            if notify_startup:
-                check_parameter['notify_startup'] = notify_startup
-            if name:
-                check_parameter['name'] = name
-            for check_path in check_paths:
-                checks.append(cls(check=check_path, config=check_parameter,
-                                  args=args))
+
+        # For every set of params (e.g.: {'port': 8888}) return a single check.
+        # We'll template the $variables in the `command` value with the params.
+        for param_dict in params_list:
+            command = _subprocess_command(raw_command, param_dict, hostname)
+            checks.append(cls(name, command, check_config))
+
         return checks
 
-    def __init__(self, check, config, args=[]):
-        self.check = check
+    def __init__(self, name, command, config):
+        """ Initializes a BernardCheck with the given `name` and `command`.
+            Any additional config (e.g. timeout or period) is given in the
+            `config` dict. `command` is expected to be a flat,
+            subprocess-friendly list, e.g.: ['check_foo', '-h', 'localhost'].
+        """
+        self.name = name
         self.config = config
-        self.args = args
-        self.command = [self.check] + args
-
+        self.command = command
         self.run_count = 0
         self.event_count = 0
 
-        self.container_size = self.config['attempts'] + 1
-
-        # Contains the result of #{container_size} last checks
-        self.result_container = []
+        # Always holds the latest result.
+        self.result = None
 
-        # Set check_name, remove file extension and "check_" prefix
-        if 'name' in config:
-            check_name = config['name']
-        else:
-            check_name = self.check.split('/')[-1]
-            if check_name.startswith('check_'):
-                check_name = check_name[6:]
-            check_name = check_name.rsplit('.')[0]
-
-        self.check_name = check_name.lower()
-        log.debug(u"Initialized check %s (%s)" % (self.check_name, ' '.join(self.command)))
+        log.debug(u"Initialized check %s (%s)" % (self.name, command))
 
     def __repr__(self):
-        return self.check_name
+        return self.name
+
+    def get_period(self):
+        return self.config['period']
 
     def _execute_check(self):
         timeout = self.config.get('timeout')
         output = None
         returncode = None
+
         # This is going to disable the StaticWatchdog
         signal.signal(signal.SIGALRM, self.timeout_handler)
         signal.alarm(timeout)
         try:
             try:
-                process = subprocess.Popen(self.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                process = subprocess.Popen(self.command,
+                                           stdout=subprocess.PIPE,
+                                           stderr=subprocess.PIPE)
                 output = process.communicate()[0].strip()
                 returncode = process.returncode
                 if len(output) > 20:
@@ -136,7 +110,7 @@ def _execute_check(self):
                 else:
                     truncated_output = output
                 log.info(u"Check[%s]: %s => %s (%s)" % (
-                    self.check_name,
+                    self.name,
                     u' '.join(self.command),
                     returncode,
                     truncated_output
@@ -153,50 +127,53 @@ def _execute_check(self):
     def timeout_handler(self, signum, frame):
         raise Timeout()
 
-    def run(self):
+    def run(self, dogstatsd_client):
         execution_date = time.time()
         try:
             output, returncode = self._execute_check()
+
             if output is None:
-                status = S.TIMEOUT
-                state = R.UNKNOWN
+                state = S.TIMEOUT
+                status = R.UNKNOWN
                 message = 'Check %s timed out after %ds' % (self, self.config['timeout'])
             else:
-                try:
-                    state, message = self.parse_nagios(output, returncode)
-                    status = S.OK
-                except InvalidCheckOutput:
-                    status = S.INVALID_OUTPUT
-                    state = R.UNKNOWN
-                    message = u'Failed to parse the output of the check: %s, returncode: %d, output: %s' % (
-                        self, returncode, output)
+                if returncode not in R.ALL:
+                    state = S.INVALID_OUTPUT
+                    status = R.UNKNOWN
+                    message = u'Failed to parse the output of the check: %s, ' \
+                               'returncode: %d, output: %s' \
+                                    % (self, returncode, output)
                     log.warn(message)
-        except OSError, exception:
-            state = R.UNKNOWN
-            status = S.EXCEPTION
+                else:
+                    message = self.parse_nagios(output, dogstatsd_client)
+                    state = S.OK
+                    status = returncode
+        except OSError:
+            status = R.UNKNOWN
+            state = S.EXCEPTION
             message = u'Failed to execute the check: %s' % self
             log.warn(message, exc_info=True)
 
         execution_time = time.time() - execution_date
         self.run_count += 1
 
-        return CheckResult(
+        check_result = CheckResult(
             status=status,
             state=state,
             message=message,
             execution_date=execution_date,
             execution_time=execution_time
         )
+        self.result = check_result
+        return check_result
 
-    def parse_nagios(self, output, returncode):
-        state = returncode
-
+    def parse_nagios(self, output, dogstatsd_client):
         output = output.strip()
         try:
             message, tail = output.split('|', 1)
         except ValueError:
             # No metric, return directly the output as a message
-            return state, output
+            return output
 
         message = message.strip()
 
@@ -217,7 +194,7 @@ def parse_nagios(self, output, returncode):
                 unit = metric.group('unit')
 
                 dd_metric = self._metric_name(label)
-                # self.dogstatsd.increment('bernard.check.metric_points')
+                dogstatsd_client.increment('bernard.check.metric_points')
 
                 if unit == '%':
                     value = value / 100.0
@@ -234,38 +211,58 @@ def parse_nagios(self, output, returncode):
                 elif unit == 'us':
                     value = value / 1000000.0
                 elif unit == 'c':
-                    # self.dogstatsd.rate(dd_metric, value)
+                    dogstatsd_client.rate(dd_metric, value)
                     log.debug('Saved rate: %s:%.2f' % (dd_metric, value))
                     continue
 
-                # self.dogstatsd.gauge(dd_metric, value)
+                dogstatsd_client.gauge(dd_metric, value)
                 log.debug('Saved metric: %s:%.2f' % (dd_metric, value))
 
-        return state, message
+        return message
 
     def _metric_name(self, label):
-        return 'bernard.%s.%s' % (self.check_name, label)
-
-    def get_last_result(self):
-        return self.get_result(0)
+        return 'bernard.%s.%s' % (self.name, label)
 
-    def get_result(self, position=0):
-        if len(self.result_container) > position:
-            index = - (position + 1)
-            return self.result_container[index]
-        elif position > self.container_size:
-            raise Exception('Trying to get %dth result while container size is %d' % (position, self.container_size))
-        else:
-            return CheckResult(execution_date=0, status=S.OK, state=R.NONE, message='Not runned yet', execution_time=0)
+    def get_result(self):
+        if self.result:
+            return self.result
+        return CheckResult(execution_date=0, state=S.OK, status=R.NONE,
+                           message='Not yet run.', execution_time=0)
 
     def get_status(self):
-        result = self.get_last_result()
+        result = self.get_result()
 
         return {
-            'check_name': self.check_name,
+            'check_name': self.name,
             'run_count': self.run_count,
             'status': result.status,
             'state': result.state,
             'message': result.message,
             'execution_time': result.execution_time,
         }
+
+
+def _subprocess_command(raw_command, params, hostname):
+    """ Given a raw command from the Bernard config and a dictionary of check
+        parameters, return a flat, subprocess-compatible argument list. Each
+        "$name" variable in the command is replaced by the matching parameter.
+
+    >>> _subprocess_command("/usr/bin/check_pg -p $port", {'port': 5433}, 'h1')
+    ['/usr/bin/check_pg', '-p', '5433']
+    """
+    # $host is always available; copy so the caller's dict is not mutated.
+    params = dict(params)
+    params.setdefault('host', hostname)
+
+    # Replace variables, longest name first so "$port" can't clobber "$port2".
+    for param in sorted(params, key=lambda p: len(str(p)), reverse=True):
+        raw_command = raw_command.replace('$%s' % param, str(params[param]))
+
+    # Split into subprocess format; a blank command cannot be executed.
+    command_split = raw_command.split()
+    if not command_split:
+        raise Exception('Invalid command in config: %s' % raw_command)
+    parsed_command = [command_split[0]]
+    if command_split[1:]:
+        parsed_command.extend(shlex.split(' '.join(command_split[1:])))
+    return parsed_command
diff --git a/bernard/cli.py b/bernard/cli.py
index 577a4a07bf..d46451f05c 100644
--- a/bernard/cli.py
+++ b/bernard/cli.py
@@ -1,20 +1,22 @@
+# stdlib
 import logging
 import sys
+
+# project
+from bernard.core import Bernard
 from config import initialize_logging, get_config, get_parsed_args
 from daemon import AgentSupervisor
-
 from util import (
     PidFile,
     get_hostname,
 )
-from bernard.core import Bernard
 
 log = logging.getLogger(__name__)
 
 def main():
     """" Execution of Bernard"""
-    # Check we're not using an old version of Python. We need 2.4 above because some modules (like subprocess)
-    # were only introduced in 2.4.
+    # Check we're not using an old version of Python. We need 2.4 or above
+    # because some modules (like subprocess) were only introduced in 2.4.
     if int(sys.version_info[1]) <= 3:
         sys.stderr.write("Datadog agent requires python 2.4 or later.\n")
         sys.exit(2)
diff --git a/bernard/core.py b/bernard/core.py
index 9ef48cd51a..9b4f29f16d 100644
--- a/bernard/core.py
+++ b/bernard/core.py
@@ -1,19 +1,22 @@
+# stdlib
 import logging
 import signal
 import sys
 import time
+
+# project
+from bernard.check import R, S
+from bernard.scheduler import Scheduler
+from checks.check_status import AgentStatus, style
+from config import get_config_path
 from daemon import Daemon
+from dogstatsd_client import DogStatsd
 from util import (
     StaticWatchdog,
     get_os,
     yaml,
     yLoader,
 )
-from checks.check_status import AgentStatus, style
-from config import get_config_path
-
-from bernard.check import R, S
-from bernard.scheduler import Scheduler
 
 RESTART_INTERVAL = 4 * 24 * 60 * 60 # Defaults to 4 days
 BERNARD_CONF = "bernard.yaml"
@@ -61,7 +64,9 @@ def run(self):
 
         # load Bernard config and checks
         bernard_config = get_bernard_config()
-        self.scheduler = Scheduler.from_config(self.hostname, bernard_config)
+        dogstatsd_client = DogStatsd()
+        self.scheduler = Scheduler.from_config(self.hostname, bernard_config,
+                                               dogstatsd_client)
 
         # Save the agent start-up stats.
         BernardStatus(checks=self.scheduler.checks).persist()
@@ -117,13 +122,14 @@ class BernardStatus(AgentStatus):
 
     NAME = 'Bernard'
 
-    def __init__(self, checks=[], schedule_count=0):
+    def __init__(self, checks=None, schedule_count=0):
         AgentStatus.__init__(self)
+        checks = checks or []
         self.check_stats = [check.get_status() for check in checks]
         self.schedule_count = schedule_count
 
-        self.STATUS_COLOR = {S.OK: 'green', S.TIMEOUT: 'yellow', S.EXCEPTION: 'red', S.INVALID_OUTPUT: 'red'}
-        self.STATE_COLOR = {R.OK: 'green', R.WARNING: 'yellow', R.CRITICAL: 'red', R.UNKNOWN: 'yellow', R.NONE: 'white'}
+        self.STATUS_COLOR = {R.OK: 'green', R.WARNING: 'yellow', R.CRITICAL: 'red', R.UNKNOWN: 'yellow', R.NONE: 'white'}
+        self.STATE_COLOR = {S.OK: 'green', S.TIMEOUT: 'yellow', S.EXCEPTION: 'red', S.INVALID_OUTPUT: 'red'}
 
     def body_lines(self):
         lines = [
@@ -141,8 +147,8 @@ def body_lines(self):
         for check in self.check_stats:
             status_color = self.STATUS_COLOR[check['status']]
             state_color = self.STATE_COLOR[check['state']]
-            lines += ['  %s: [%s] #%d run is %s' % (check['check_name'], style(check['status'], status_color),
-                                                    check['run_count'], style(check['state'], state_color))]
+            lines += ['  %s: [%s] #%d run is %s' % (check['check_name'], style(check['state'], status_color),
+                                                    check['run_count'], style(check['status'], state_color))]
             lines += ['    %s' % ((check['message'] or ' ').splitlines()[0])]
 
         return lines
@@ -162,8 +168,6 @@ def to_dict(self):
 
 def get_bernard_config():
     """Return the configuration of Bernard"""
-
-    osname = get_os()
     config_path = get_config_path(os_name=get_os(), filename=BERNARD_CONF)
 
     try:
diff --git a/bernard/scheduler.py b/bernard/scheduler.py
index 79ff0971d5..68ca14a9e9 100644
--- a/bernard/scheduler.py
+++ b/bernard/scheduler.py
@@ -1,58 +1,57 @@
+# stdlib
 import logging
 import random
 import time
-import kima.client
 
-from bernard.check import BernardCheck, R, S
+# project
+import kima.client
+from bernard.check import BernardCheck, S
 
 log = logging.getLogger(__name__)
 
+# FIXME: Overriding the config for Kima.
+API_KEY = 'apikey_2'
+BASE_URL = 'http://localhost:5000'
+
 class Scheduler(object):
-    """
-    Schedule Bernard checks execution.
-    """
+    """ Schedule Bernard checks execution. """
 
     # Ratio of jitter to introduce in the scheduling
     JITTER_FACTOR = 0.1
 
+    # Check config defaults
+    DEFAULT_TIMEOUT = 5
+    DEFAULT_PERIOD = 15
+
     @classmethod
-    def from_config(cls, hostname, bernard_config):
+    def from_config(cls, hostname, bernard_config, dogstatsd_client):
         schedule_config = bernard_config.get('core', {}).get('schedule', {})
         bernard_checks = []
 
-        DEFAULT_TIMEOUT = 5
-        DEFAULT_FREQUENCY = 60
-        DEFAULT_ATTEMPTS = 3
-
-        default_check_parameter = {
-            'hostname': hostname,
-            'timeout': int(schedule_config.get('timeout', DEFAULT_TIMEOUT)),
-            'frequency': int(schedule_config.get('period', DEFAULT_FREQUENCY)),
-            'attempts': int(schedule_config.get('period', DEFAULT_ATTEMPTS)),
-            'notification': bernard_config.get('core', {}).get('notification', None),
-            'notify_startup': bernard_config.get('core', {}).get('notify_startup', "none"),
+        default_options = {
+            'timeout': int(schedule_config.get('timeout', cls.DEFAULT_TIMEOUT)),
+            'period': int(schedule_config.get('period', cls.DEFAULT_PERIOD)),
         }
 
-        try:
-            check_configs = bernard_config.get('checks') or []
-            for check_config in check_configs:
-                try:
-                    bernard_checks.extend(BernardCheck.from_config(check_config, default_check_parameter))
-                except Exception, e:
-                    log.exception(e)
-
-        except AttributeError:
-            log.info("Error while parsing Bernard configuration file. Be sure the structure is valid.")
-            return []
+        check_configs = bernard_config.get('checks') or {}
+        for check_name, check_config in check_configs.iteritems():
+            try:
+                check = BernardCheck.from_config(check_name, check_config,
+                                                 default_options, hostname)
+            except Exception:
+                log.exception('Unable to load check %s' % check_name)
+            else:
+                bernard_checks.extend(check)
 
         return cls(checks=bernard_checks, config=bernard_config,
-                   hostname=hostname)
+                   hostname=hostname, dogstatsd_client=dogstatsd_client)
 
-    def __init__(self, checks, config, hostname):
-        """Initialize scheduler"""
+    def __init__(self, checks, config, hostname, dogstatsd_client):
+        """ Initialize scheduler """
         self.checks = checks
         self.config = config
         self.hostname = hostname
+        self.dogstatsd_client = dogstatsd_client
         self.schedule_count = 0
 
         # Initialize schedule
@@ -62,11 +61,9 @@ def __init__(self, checks, config, hostname):
         for check in self.checks:
             self.schedule.append((position, check))
             position += 1
-            check.last_notified_state = R.NONE
 
-        api_key = 'apikey_2'
-        base_url = 'http://localhost:9000'
-        self.kima = kima.client.connect(api_key, base_url)
+        # Initialize our kima client.
+        self.kima = kima.client.connect(API_KEY, BASE_URL)
 
         # Scheduler doesn't need to be initialize if no check
         assert self.checks
@@ -93,7 +90,7 @@ def _now(self):
     def process(self):
         """ Execute the next scheduled check """
         check = self._pop_check()
-        result = check.run()
+        result = check.run(self.dogstatsd_client)
         self.schedule_count += 1
 
         # post results
@@ -107,13 +104,13 @@ def process(self):
 
     def reschedule_check(self, check):
         # Get the duration to wait for the next scheduling
-        waiting = check.config['frequency']
-        status = check.get_last_result().status
-        if status == S.TIMEOUT:
+        waiting = check.get_period()
+        state = check.get_result().state
+        if state == S.TIMEOUT:
             waiting = waiting * 3
-        elif status == S.INVALID_OUTPUT:
+        elif state == S.INVALID_OUTPUT:
             waiting = waiting * 8
-        elif status == S.EXCEPTION:
+        elif state == S.EXCEPTION:
             waiting = waiting * 12
 
         jitter_range = self.JITTER_FACTOR * waiting
@@ -134,39 +131,8 @@ def reschedule_check(self, check):
         log.debug('%s is rescheduled, next run in %.2fs' % (check, waiting))
 
     def post_run(self, result):
-        return self.kima.post_monitor_run(
-                # Impedence mismatch: bernard calls states what the server
-                # knows as statuses. Should reconcile.
-                status=result.state,
+        return self.kima.post_check_run(
+                status=result.status,
                 output=result.message,
                 timestamp=result.execution_date,
                 host_name=self.hostname)
-
-class SimulatedScheduler(Scheduler):
-    def __init__(self, *args, **kwargs):
-        Scheduler.__init__(self, *args, **kwargs)
-        self.virtual_time = time.time()
-
-    def wait_time(self):
-        return 0
-
-    def _now(self):
-        """
-        Set the virtual time at the end of the check execution
-        Reschedule the check on a timestamp based on this virtual time
-        """
-        last_result = check.get_last_result()
-        timestamp = self.virtual_time + last_result.execution_time
-        self.virtual_time = timestamp
-        return timestamp
-
-    def _pop_check(self):
-        """
-        When going to run a next check in simulated time, move the
-        simulated time to the next scheduled timestamp if
-        it is in the future
-        """
-        self.virtual_time = max(self.schedule[0][0], self.virtual_time)
-        if self.schedule:
-            return self.schedule.pop(0)[1]
-
diff --git a/tests/test_bernard.py b/tests/test_bernard.py
index 3c40215fa3..be5b5e6837 100644
--- a/tests/test_bernard.py
+++ b/tests/test_bernard.py
@@ -3,9 +3,45 @@
 import os
 import time
 logger = logging.getLogger()
-from scheduler import Scheduler, Notifier
-from checks.bernard_check import BernardCheck, R, S
+from bernard.scheduler import Scheduler
+from bernard.check import BernardCheck, R, S
 from dogstatsd_client import DogStatsd
+from util import get_hostname
+
+class FakeDogstatsd(DogStatsd):
+    """Fake DogStatsd client, which keeps requests to test them"""
+
+    def __init__(self, host='localhost', port=8125):
+        self.host = host
+        self.port = port
+        self.metrics = []
+        self.events = []
+
+    def _send(self, metric, metric_type, value, tags, sample_rate):
+        self.metrics.append({
+            'metric': metric,
+            'metric_type': metric_type,
+            'value': value,
+            'tags': tags,
+            'sample_rate': sample_rate,
+            })
+
+    def event(self, title, text, alert_type=None, aggregation_key=None, source_type_name=None, date_happened=None, priority=None, tags=None, hostname=None):
+        self.events.append({
+            'title': title,
+            'text': text,
+            'alert_type': alert_type,
+            'aggregation_key': aggregation_key,
+            'source_type_name': source_type_name,
+            'date_happened': date_happened,
+            'priority': priority,
+            'tags': tags,
+            'hostname': hostname,
+            })
+
+    def flush(self):
+        self.metrics = []
+        self.events = []
 
 class TestBernardCheck(unittest.TestCase):
     "Tests to validate the Bernard check logic"
@@ -14,15 +50,16 @@ class TestBernardCheck(unittest.TestCase):
     def test_timeout_check(self):
         """Specific tests for timeout checks to make tests faster"""
         check_timeout = self._get_timeout_check()
+        dogstatsd = FakeDogstatsd()
 
         start = time.time()
-        check_timeout.run()
+        check_timeout.run(dogstatsd)
         end = time.time()
 
         # Check status and state
-        result = check_timeout.get_last_result()
-        self.assertEqual(result.state, R.UNKNOWN)
-        self.assertEqual(result.status, S.TIMEOUT)
+        result = check_timeout.get_result()
+        self.assertEqual(result.status, R.UNKNOWN)
+        self.assertEqual(result.state, S.TIMEOUT)
 
         # Check execution duration. Timeout is 1s, let's give it 1.2s to run.
         self.assertTrue(result.execution_time < 1.2)
@@ -31,48 +68,50 @@ def test_timeout_check(self):
     def test_result(self):
         """Test result of each checks: if status and states are okay"""
         check_ok, check_warning, check_wrong_exit, check_disappeared = self._get_test_checks()
+        dogstatsd_client = FakeDogstatsd()
 
-        check_ok.run()
-        check_warning.run()
-        check_wrong_exit.run()
-        check_disappeared.run()
+        check_ok.run(dogstatsd_client)
+        check_warning.run(dogstatsd_client)
+        check_wrong_exit.run(dogstatsd_client)
+        check_disappeared.run(dogstatsd_client)
 
-        result = check_ok.get_last_result()
-        self.assertEqual(result.state, R.OK)
-        self.assertEqual(result.status, S.OK)
+        result = check_ok.get_result()
+        self.assertEqual(result.status, R.OK)
+        self.assertEqual(result.state, S.OK)
 
-        result = check_warning.get_last_result()
-        self.assertEqual(result.state, R.WARNING)
-        self.assertEqual(result.status, S.OK)
+        result = check_warning.get_result()
+        self.assertEqual(result.status, R.WARNING)
+        self.assertEqual(result.state, S.OK)
 
-        result = check_wrong_exit.get_last_result()
-        self.assertEqual(result.state, R.UNKNOWN)
-        self.assertEqual(result.status, S.INVALID_OUTPUT)
+        result = check_wrong_exit.get_result()
+        self.assertEqual(result.status, R.UNKNOWN)
+        self.assertEqual(result.state, S.INVALID_OUTPUT)
 
-        result = check_disappeared.get_last_result()
-        self.assertEqual(result.state, R.UNKNOWN)
-        self.assertEqual(result.status, S.EXCEPTION)
+        result = check_disappeared.get_result()
+        self.assertEqual(result.status, R.UNKNOWN)
+        self.assertEqual(result.state, S.EXCEPTION)
 
     def test_perfdata_metrics(self):
         """Test perfdata metrics: if the parsing, scaling and dogstatsd calls are okay"""
         check_ok, check_warning, check_wrong_exit, check_disappeared = self._get_test_checks()
+        dogstatsd_client = FakeDogstatsd()
         metric = {}
 
-        check_ok.run()
-        for m in check_ok.dogstatsd.metrics:
-            if m['metric'] == 'bernard.ok.metric1':
+        check_ok.run(dogstatsd_client)
+        for m in dogstatsd_client.metrics:
+            if m['metric'] == 'bernard.check_ok.metric1':
                 metric = m
         self.assertEqual(metric.get('value'), 30)
         self.assertEqual(metric.get('metric_type'), 'g')
 
-        check_warning.run()
-        for m in check_warning.dogstatsd.metrics:
-            if m['metric'] == 'bernard.warning.timing':
+        check_warning.run(dogstatsd_client)
+        for m in dogstatsd_client.metrics:
+            if m['metric'] == 'bernard.check_warning.timing':
                 metric = m
         self.assertEqual(metric.get('value'), 0.001)
         self.assertEqual(metric.get('metric_type'), 'g')
-        for m in check_warning.dogstatsd.metrics:
-            if m['metric'] == 'bernard.warning.count':
+        for m in dogstatsd_client.metrics:
+            if m['metric'] == 'bernard.check_warning.count':
                 metric = m
         self.assertEqual(metric.get('value'), 1234)
         self.assertEqual(metric.get('metric_type'), '_dd-r')
@@ -87,10 +126,10 @@ def test_scheduler(self):
         scheduler.JITTER_FACTOR = 0
 
         # Be sure it keeps the initial order
-        self.assertEqual(scheduler.schedule[0][1].check_name, check_ok.check_name)
-        self.assertEqual(scheduler.schedule[1][1].check_name, check_warning.check_name)
-        self.assertEqual(scheduler.schedule[2][1].check_name, check_wrong_exit.check_name)
-        self.assertEqual(scheduler.schedule[3][1].check_name, check_disappeared.check_name)
+        self.assertEqual(scheduler.schedule[0][1].name, check_ok.name)
+        self.assertEqual(scheduler.schedule[1][1].name, check_warning.name)
+        self.assertEqual(scheduler.schedule[2][1].name, check_wrong_exit.name)
+        self.assertEqual(scheduler.schedule[3][1].name, check_disappeared.name)
 
         # Should run each check once
         scheduler.process()
@@ -99,85 +138,43 @@ def test_scheduler(self):
         scheduler.process()
 
         # Look at the new schedule
-        self.assertEqual(scheduler.schedule[0][1].check_name, check_ok.check_name)
-        self.assertEqual(scheduler.schedule[1][1].check_name, check_warning.check_name)
-        self.assertEqual(scheduler.schedule[2][1].check_name, check_wrong_exit.check_name)
-        self.assertEqual(scheduler.schedule[3][1].check_name, check_disappeared.check_name)
+        self.assertEqual(scheduler.schedule[0][1].name, check_ok.name)
+        self.assertEqual(scheduler.schedule[1][1].name, check_warning.name)
+        self.assertEqual(scheduler.schedule[2][1].name, check_wrong_exit.name)
+        self.assertEqual(scheduler.schedule[3][1].name, check_disappeared.name)
 
         # Be sure that schedule order corresponds to timestamps
         self.assertTrue(scheduler.schedule[1][0] <= scheduler.schedule[1][0])
         self.assertTrue(scheduler.schedule[1][0] <= scheduler.schedule[2][0])
         self.assertTrue(scheduler.schedule[2][0] <= scheduler.schedule[3][0])
 
-
-    def _get_check_parameters(self):
-        dogstatsd = FakeDogstatsd()
-        path = os.path.dirname(os.path.abspath(__file__))
-        path = os.path.join(path, 'bernard_checks')
-
-        config = {
-            'frequency': 60,
-            'attempts': 3,
-            'timeout': 1,
-            'notification': '',
-            'notify_startup': 'none',
-        }
-
-        return path, config, dogstatsd
-
     def _get_test_checks(self):
-        path, config, dogstatsd = self._get_check_parameters()
-
         return [
-            BernardCheck(os.path.join(path, 'check_ok'), config, dogstatsd),
-            BernardCheck(os.path.join(path, 'check_warning'), config, dogstatsd),
-            BernardCheck(os.path.join(path, 'check_wrong_exit'), config, dogstatsd),
-            BernardCheck(os.path.join(path, 'check_disappeared'), config, dogstatsd),
+            BernardCheck.from_config('check_ok', self._get_check_config('check_ok'), {})[0],
+            BernardCheck.from_config('check_warning', self._get_check_config('check_warning'), {})[0],
+            BernardCheck.from_config('check_wrong_exit', self._get_check_config('check_wrong_exit'), {})[0],
+            BernardCheck.from_config('check_disappeared', self._get_check_config('check_disappeared'), {})[0],
         ]
 
     def _get_timeout_check(self):
-        path, config, dogstatsd = self._get_check_parameters()
-
-        return BernardCheck(os.path.join(path, 'check_timeout'), config, dogstatsd)
+        return BernardCheck.from_config('check_timeout', self._get_check_config('check_timeout'), {})[0]
 
     def _get_scheduler(self, checks):
-        return Scheduler(checks=checks, config={})
+        return Scheduler(checks, {}, get_hostname(), FakeDogstatsd())
+
+    def _get_check_config(self, command):
+        path = os.path.dirname(os.path.abspath(__file__))
+        path = os.path.join(path, 'bernard_checks')
+        return {
+            'command': os.path.join(path, command),
+            'options': {
+                'period': 60,
+                'attempts': 3,
+                'timeout': 1,
+            }
+        }
 
 
 if __name__ == '__main__':
     unittest.main()
 
-class FakeDogstatsd(DogStatsd):
-    """Fake DogStatsd client, which keeps requests to test them"""
-
-    def __init__(self, host='localhost', port=8125):
-        self.host = host
-        self.port = port
-        self.metrics = []
-        self.events = []
-
-    def _send(self, metric, metric_type, value, tags, sample_rate):
-        self.metrics.append({
-            'metric': metric,
-            'metric_type': metric_type,
-            'value': value,
-            'tags': tags,
-            'sample_rate': sample_rate,
-            })
-
-    def event(self, title, text, alert_type=None, aggregation_key=None, source_type_name=None, date_happened=None, priority=None, tags=None, hostname=None):
-        self.events.append({
-            'title': title,
-            'text': text,
-            'alert_type': alert_type,
-            'aggregation_key': aggregation_key,
-            'source_type_name': source_type_name,
-            'date_happened': date_happened,
-            'priority': priority,
-            'tags': tags,
-            'hostname': hostname,
-            })
-
-    def flush(self):
-        self.metrics = []
-        self.events = []
\ No newline at end of file

From c3c9163a1c552e913cb4e64bb3d313110d2397f8 Mon Sep 17 00:00:00 2001
From: Conor Branagan <conor.branagan@gmail.com>
Date: Tue, 31 Dec 2013 18:47:58 +0000
Subject: [PATCH 2/3] Fix pylint issue.

---
 bernard/core.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bernard/core.py b/bernard/core.py
index 9b4f29f16d..79cff94875 100644
--- a/bernard/core.py
+++ b/bernard/core.py
@@ -9,7 +9,7 @@
 from bernard.scheduler import Scheduler
 from checks.check_status import AgentStatus, style
 from config import get_config_path
-from daemon import Daemon
+from daemon import Daemon, AgentSupervisor
 from dogstatsd_client import DogStatsd
 from util import (
     StaticWatchdog,
@@ -104,8 +104,7 @@ def run(self):
         # Now clean-up.
         BernardStatus.remove_latest_status()
 
-        # Explicitly kill the process, because it might be running
-        # as a daemon.
+        # Explicitly kill the process, because it might be running as a daemon.
         log.info("Exiting. Bye bye.")
         sys.exit(0)
 

From 963c76a5d26e825541b3af2b410a25e5ad3d0002 Mon Sep 17 00:00:00 2001
From: Conor Branagan <conor.branagan@gmail.com>
Date: Tue, 31 Dec 2013 20:16:46 +0000
Subject: [PATCH 3/3] Pass the params and check name to the posted check run.

---
 bernard/check.py     | 14 ++++++++++----
 bernard/scheduler.py |  6 ++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/bernard/check.py b/bernard/check.py
index b48f8cafe9..628bfcbfbd 100644
--- a/bernard/check.py
+++ b/bernard/check.py
@@ -62,12 +62,17 @@ def from_config(cls, name, check_config, defaults, hostname=None):
         # For every set of params (e.g.: {'port': 8888}) return a single check.
         # We'll template the $variables in the `command` value with the params.
         for param_dict in params_list:
+            # Stringify all of the check params. We expect everything to be
+            # strings through the pipeline so we'll do it early on.
+            for k, v in param_dict.iteritems():
+                param_dict[k] = str(v)
+
             command = _subprocess_command(raw_command, param_dict, hostname)
-            checks.append(cls(name, command, check_config))
+            checks.append(cls(name, command, check_config, param_dict))
 
         return checks
 
-    def __init__(self, name, command, config):
+    def __init__(self, name, command, config, params):
         """ Initializes a BernardCheck with the given `name` and `command`.
             Any additional config (e.g. timeout or period) are given in the
             `config` dict. `command` is expected to be in a subprocess-friendly
@@ -76,6 +81,7 @@ def __init__(self, name, command, config):
         self.name = name
         self.config = config
         self.command = command
+        self.params = params
         self.run_count = 0
         self.event_count = 0
 
@@ -162,7 +168,7 @@ def run(self, dogstatsd_client):
             state=state,
             message=message,
             execution_date=execution_date,
-            execution_time=execution_time
+            execution_time=execution_time,
         )
         self.result = check_result
         return check_result
@@ -256,7 +262,7 @@ def _subprocess_command(raw_command, params, hostname):
 
     # Replace variables.
     for param, val in params.iteritems():
-        raw_command = raw_command.replace('$%s' % param, str(val))
+        raw_command = raw_command.replace('$%s' % param, val)
 
     # Split into subprocess format.
     command_split = raw_command.split()
diff --git a/bernard/scheduler.py b/bernard/scheduler.py
index 68ca14a9e9..584216c264 100644
--- a/bernard/scheduler.py
+++ b/bernard/scheduler.py
@@ -95,7 +95,7 @@ def process(self):
 
         # post results
         try:
-            self.post_run(result)
+            self.post_run(check, result)
         except Exception:
             log.error("Could not post run", exc_info=True)
 
@@ -130,9 +130,11 @@ def reschedule_check(self, check):
 
         log.debug('%s is rescheduled, next run in %.2fs' % (check, waiting))
 
-    def post_run(self, result):
+    def post_run(self, check, result):
         return self.kima.post_check_run(
+                check=check.name,
                 status=result.status,
                 output=result.message,
                 timestamp=result.execution_date,
+                params=check.params,
                 host_name=self.hostname)