From c3ae0e6186e07481b6ec8a117b88e090ea4845e4 Mon Sep 17 00:00:00 2001 From: Quentin Madec Date: Wed, 18 Feb 2015 12:55:50 -0500 Subject: [PATCH 1/4] [etcd] send service check ok when all is fine And not only CRITICAL when unable to get metrics. This is needed as the service check won't appear on the monitor screen otherwise. Fix python2.6 compatibility issue: "{}".format() --- checks.d/etcd.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/checks.d/etcd.py b/checks.d/etcd.py index cd0d7caceb..50dce7b47b 100644 --- a/checks.d/etcd.py +++ b/checks.d/etcd.py @@ -54,7 +54,7 @@ def check(self, instance): instance_tags = instance.get('tags', []) # Append the instance's URL in case there are more than one, that # way they can tell the difference! - instance_tags.append("url:{}".format(url)) + instance_tags.append("url:{0}".format(url)) timeout = float(instance.get('timeout', self.DEFAULT_TIMEOUT)) self_response = self.get_self_metrics(url, timeout) @@ -68,13 +68,13 @@ def check(self, instance): if key in self_response: self.rate(self.SELF_RATES[key], self_response[key], tags=instance_tags) else: - self.log.warn("Missing key {} in stats.".format(key)) + self.log.warn("Missing key {0} in stats.".format(key)) for key in self.SELF_GAUGES: if key in self_response: self.gauge(self.SELF_GAUGES[key], self_response[key], tags=instance_tags) else: - self.log.warn("Missing key {} in stats.".format(key)) + self.log.warn("Missing key {0} in stats.".format(key)) store_response = self.get_store_metrics(url, timeout) if store_response is not None: @@ -82,13 +82,16 @@ def check(self, instance): if key in store_response: self.rate(self.STORE_RATES[key], store_response[key], tags=instance_tags) else: - self.log.warn("Missing key {} in stats.".format(key)) + self.log.warn("Missing key {0} in stats.".format(key)) for key in self.STORE_GAUGES: if key in store_response: self.gauge(self.STORE_GAUGES[key], store_response[key], tags=instance_tags) else: - 
self.log.warn("Missing key {} in stats.".format(key)) + self.log.warn("Missing key {0} in stats.".format(key)) + + if self_response is not None and store_response is not None: + self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=instance_tags) def get_self_metrics(self, url, timeout): return self.get_json(url + "/v2/stats/self", timeout) @@ -103,13 +106,13 @@ def get_json(self, url, timeout): # If there's a timeout self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="Timeout when hitting %s" % url, - tags = ["url:{}".format(url)]) - return None + tags = ["url:{0}".format(url)]) + raise if r.status_code != 200: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="Got %s when hitting %s" % (r.status_code, url), - tags = ["url:{}".format(url)]) - return None + tags = ["url:{0}".format(url)]) + raise Exception("Http status code {0} on url {1}".format(r.status_code, url)) return r.json() From 3f537dc8f61b8a13f874c8c0abcfd4e35b520b5e Mon Sep 17 00:00:00 2001 From: Quentin Madec Date: Wed, 18 Feb 2015 16:41:49 -0500 Subject: [PATCH 2/4] [etcd] add test, metrics and service checks Copied from the apache/riakcs tests, using the AgentCheckTest. 
--- .travis.yml | 1 + Rakefile | 1 + ci/etcd.rb | 63 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_etcd.py | 43 +++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+) create mode 100644 ci/etcd.rb create mode 100644 tests/test_etcd.py diff --git a/.travis.yml b/.travis.yml index b963f7fcaa..6e9431814e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -64,6 +64,7 @@ env: - TRAVIS_FLAVOR=ssh - TRAVIS_FLAVOR=fluentd - TRAVIS_FLAVOR=rabbitmq + - TRAVIS_FLAVOR=etcd # Override travis defaults with empty jobs before_install: echo "OVERRIDING TRAVIS STEPS" diff --git a/Rakefile b/Rakefile index aabae0a6c0..8da0af9eaf 100755 --- a/Rakefile +++ b/Rakefile @@ -9,6 +9,7 @@ require './ci/cassandra' require './ci/couchdb' require './ci/default' require './ci/elasticsearch' +require './ci/etcd' require './ci/fluentd' require './ci/gearman' require './ci/haproxy' diff --git a/ci/etcd.rb b/ci/etcd.rb new file mode 100644 index 0000000000..5f161f5d42 --- /dev/null +++ b/ci/etcd.rb @@ -0,0 +1,63 @@ +require './ci/common' + +def etcd_version + ENV['ETCD_VERSION'] || '2.0.3' +end + +def etcd_rootdir + "#{ENV['INTEGRATIONS_DIR']}/etcd_#{etcd_version}" +end + +namespace :ci do + namespace :etcd do |flavor| + task :before_install => ['ci:common:before_install'] + + task :install => ['ci:common:install'] do + unless Dir.exist? 
File.expand_path(etcd_rootdir) + sh %(curl -s -L -o $VOLATILE_DIR/etcd.tar.gz\ + https://github.com/coreos/etcd/releases/download/v#{etcd_version}/etcd-v#{etcd_version}-linux-amd64.tar.gz) + sh %(mkdir -p #{etcd_rootdir}) + sh %(tar xzvf $VOLATILE_DIR/etcd.tar.gz\ + -C #{etcd_rootdir}\ + --strip-components=1 >/dev/null) + end + end + + task :before_script => ['ci:common:before_script'] do + sh %(cd $VOLATILE_DIR && #{etcd_rootdir}/etcd >/dev/null &) + sleep_for 10 + end + + task :script => ['ci:common:script'] do + this_provides = [ + 'etcd' + ] + Rake::Task['ci:common:run_tests'].invoke(this_provides) + end + + task :cleanup => ['ci:common:cleanup'] do + # This will delete the temp directory of etcd, + # so the etcd process will kill himself quickly after that (<10s) + sh %(rm -rf $VOLATILE_DIR/*etcd*) + end + + task :execute do + exception = nil + begin + %w(before_install install before_script script).each do |t| + Rake::Task["#{flavor.scope.path}:#{t}"].invoke + end + rescue => e + exception = e + puts "Failed task: #{e.class} #{e.message}".red + end + if ENV['SKIP_CLEANUP'] + puts 'Skipping cleanup, disposable environments are great'.yellow + else + puts 'Cleaning up' + Rake::Task["#{flavor.scope.path}:cleanup"].invoke + end + fail exception if exception + end + end +end diff --git a/tests/test_etcd.py b/tests/test_etcd.py new file mode 100644 index 0000000000..9ab68f2f98 --- /dev/null +++ b/tests/test_etcd.py @@ -0,0 +1,43 @@ +import unittest +from tests.common import AgentCheckTest +from nose.plugins.attrib import attr +from time import sleep +from checks import AgentCheck +from requests.exceptions import Timeout + +@attr(requires='etcd') +class EtcdTest(AgentCheckTest): + + CHECK_NAME = "etcd" + def __init__(self, *args, **kwargs): + AgentCheckTest.__init__(self, *args, **kwargs) + self.config = {"instances": [{"url": "http://localhost:4001"}]} + + def test_metrics(self): + self.run_check(self.config) + sleep(1) + self.run_check(self.config) + tags = 
['url:http://localhost:4001', 'etcd_state:leader'] + self.assertMetric('etcd.store.gets.success', metric_value=0.0, tags=tags) + self.assertMetric('etcd.store.gets.fail', metric_value=0.0, tags=tags) + self.assertMetric('etcd.self.send.appendrequest.count', metric_value=0.0, tags=tags) + + + def test_service_checks(self): + self.run_check(self.config) + + self.assertEqual(len(self.service_checks), 1, self.service_checks) + sc = self.service_checks[0] + self.assertEquals(sc["check"], self.check.SERVICE_CHECK_NAME, sc["check"]) + self.assertEquals(sc["status"], AgentCheck.OK, sc["status"]) + self.assertEquals(sc["tags"], ['url:http://localhost:4001', 'etcd_state:leader'], sc["tags"]) + + def test_bad_config(self): + self.assertRaises(Exception, + lambda: self.run_check({"instances": [{"url": "http://localhost:4001/test"}]})) + service_checks = self.check.get_service_checks() + self.assertEqual(len(service_checks), 1, service_checks) + sc = service_checks[0] + self.assertEquals(sc["check"], self.check.SERVICE_CHECK_NAME, sc["check"]) + self.assertEquals(sc["status"], AgentCheck.CRITICAL, sc["status"]) + self.assertEquals(sc["tags"], ['url:http://localhost:4001/test/v2/stats/self'], sc["tags"]) From ad405a246d2a42e309faab4d86aebd71529571c2 Mon Sep 17 00:00:00 2001 From: Quentin Madec Date: Thu, 19 Feb 2015 12:31:35 -0500 Subject: [PATCH 3/4] [tests] add assertServiceCheck in common check Add assertServiceCheck (same principle as assertMetric) in AgentCheckTest. If service_checks does not exist, retrieve latest data from AgentCheckTest.check. 
--- tests/common.py | 24 +++++++++++++++++++++++- tests/test_etcd.py | 18 ++++++++---------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/tests/common.py b/tests/common.py index cdc78fc4e8..ef157aad73 100644 --- a/tests/common.py +++ b/tests/common.py @@ -123,14 +123,23 @@ def run_check(self, config, agent_config=None): if self.check is None: self.check = load_check(self.CHECK_NAME, config, agent_config) + error = None for instance in self.check.instances: - self.check.check(instance) + try: + self.check.check(instance) + except Exception, e: + # Catch error before re-raising it to be able to get service_checks + print"Exception {0} during check" + error = e self.metrics = self.check.get_metrics() self.events = self.check.get_events() self.service_checks = self.check.get_service_checks() self.warnings = self.check.get_warnings() + if error is not None: + raise error + def print_current_state(self): print "++++++++++++ DEBUG ++++++++++++" print "METRICS ", @@ -197,3 +206,16 @@ def assertMetricTag(self, metric_name, tag, count=None): candidates.append((m_name, ts, val, mdata)) self._candidates_size_assert(candidates, count=count) + + def assertServiceCheck(self, service_check_name, status=None, tags=None, count=None): + candidates = [] + for sc in self.service_checks: + if sc['check'] == service_check_name: + if status is not None and sc['status'] != status: + continue + if tags is not None and sorted(tags) != sorted(sc.get("tags")): + continue + + candidates.append(sc) + + self._candidates_size_assert(candidates, count=count) diff --git a/tests/test_etcd.py b/tests/test_etcd.py index 9ab68f2f98..dc1006fdf4 100644 --- a/tests/test_etcd.py +++ b/tests/test_etcd.py @@ -27,17 +27,15 @@ def test_service_checks(self): self.run_check(self.config) self.assertEqual(len(self.service_checks), 1, self.service_checks) - sc = self.service_checks[0] - self.assertEquals(sc["check"], self.check.SERVICE_CHECK_NAME, sc["check"]) - self.assertEquals(sc["status"], 
AgentCheck.OK, sc["status"]) - self.assertEquals(sc["tags"], ['url:http://localhost:4001', 'etcd_state:leader'], sc["tags"]) + self.assertServiceCheck(self.check.SERVICE_CHECK_NAME, + status=AgentCheck.OK, + tags=['url:http://localhost:4001', 'etcd_state:leader']) def test_bad_config(self): self.assertRaises(Exception, lambda: self.run_check({"instances": [{"url": "http://localhost:4001/test"}]})) - service_checks = self.check.get_service_checks() - self.assertEqual(len(service_checks), 1, service_checks) - sc = service_checks[0] - self.assertEquals(sc["check"], self.check.SERVICE_CHECK_NAME, sc["check"]) - self.assertEquals(sc["status"], AgentCheck.CRITICAL, sc["status"]) - self.assertEquals(sc["tags"], ['url:http://localhost:4001/test/v2/stats/self'], sc["tags"]) + + self.assertEqual(len(self.service_checks), 1, self.service_checks) + self.assertServiceCheck(self.check.SERVICE_CHECK_NAME, + status=AgentCheck.CRITICAL, + tags=['url:http://localhost:4001/test/v2/stats/self']) From af7592960600aaf1d2a7e91c21b39491f7d0b56f Mon Sep 17 00:00:00 2001 From: Quentin Madec Date: Thu, 19 Feb 2015 14:10:06 -0500 Subject: [PATCH 4/4] [tests] update riakcs with new assertServiceCheck Because of the changes in AgentCheckTest, old test was failing (because of the new error catch and reraise in run_check). It now uses assertServiceCheck. 
--- tests/test_riakcs.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_riakcs.py b/tests/test_riakcs.py index 7c388f2b56..ee5b9c1466 100644 --- a/tests/test_riakcs.py +++ b/tests/test_riakcs.py @@ -32,9 +32,7 @@ def test_service_checks(self): self.check = load_check(self.CHECK_NAME, self.config, {}) self.assertRaises(error, lambda: self.run_check(self.config)) - service_checks = self.check.get_service_checks() - self.assertEqual(len(service_checks), 1, service_checks) - sc = service_checks[0] - self.assertEquals(sc["check"], self.check.SERVICE_CHECK_NAME, sc["check"]) - self.assertEquals(sc["status"], AgentCheck.CRITICAL, sc["status"]) - self.assertEquals(sc["tags"], ['aggregation_key:localhost:8080'], sc["tags"]) + self.assertEqual(len(self.service_checks), 1, self.service_checks) + self.assertServiceCheck(self.check.SERVICE_CHECK_NAME, + status=AgentCheck.CRITICAL, + tags=['aggregation_key:localhost:8080'])