From bb4728ecf447a4a0b297f67191b42f8c1e762156 Mon Sep 17 00:00:00 2001 From: Alex Harris Date: Fri, 21 Apr 2017 14:38:28 +0100 Subject: [PATCH] Update zk_service_health metric when not serving. This metric should now correctly reflect whether the server is up and serving requests. Previously this metric tracked only the ruok status, which led to a condition where the metric reported healthy while the server was not responding to client requests. --- README.md | 3 ++- zk-collectd.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) mode change 100755 => 100644 zk-collectd.py diff --git a/README.md b/README.md index 302bbf5..c7b4c4a 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ All metrics are reported with the `plugin:zookeeper` dimension. Additionally, if you specify an `Instance` in your `Module` configuration block, its value will be reported as the `plugin_instance` dimension. -zk_is_leader is a synthetic metric which is 0 iff the contents of zk_server_state is 'follower' +zk_is_leader is a synthetic metric which is 0 if the contents of zk_server_state is 'follower'. +zk_service_health is a synthetic metric which tracks if service is running and servicing requests. # License diff --git a/zk-collectd.py b/zk-collectd.py old mode 100755 new mode 100644 index efcb250..afc3b38 --- a/zk-collectd.py +++ b/zk-collectd.py @@ -84,6 +84,15 @@ def _get_mntr_stats(self): response = self._send_cmd(MNTR_CMD) result = {} + # If instance stops serving requests (e.g. loses quorum) it still + # returns "imok" to ruok query but will force close any client + # connections and return an error string to all other 4 letter commands. + # In this situation we should override zk_service_health metric + # initially set in _get_health_stat as it's definitely not in a healthy + # state. + if response == 'This ZooKeeper instance is not currently serving requests\n': + return {'zk_service_health': 0} + for line in response.splitlines(): try: key, value = self._parse_line(line)