Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RedisSentinel] Add a redis sentinel check #1

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions circle.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
general:
branches:
only:
- /jaime.*/ # for now - whitelisting
- /tristan.*/ # for now - whitelisting
32 changes: 32 additions & 0 deletions redis_sentinel/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Redis Sentinel

## Overview

Get metrics from Redis's Sentinel service in real time to:

* Visualize and monitor the state of your sentinels
* Be notified about failovers

## Installation

Install the `dd-check-redis-sentinel` package manually or with your favorite configuration manager.

## Configuration

Edit the `redis_sentinel.yaml` file to point to your Sentinel host and port, and list the masters to monitor.

## Validation

When you run `datadog-agent info` you should see something like the following:

Checks
======

redis_sentinel
--------------
- instance #0 [OK]
- Collected 39 metrics, 0 events & 7 service checks

## Compatibility

The Redis Sentinel check is compatible with all major platforms.
212 changes: 212 additions & 0 deletions redis_sentinel/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import time
from collections import defaultdict

# 3rd party
import redis

# project
from checks import AgentCheck

# Single identifier used both as the 'event_type' and as the 'source_type_name'
# of the failover events emitted by this check.
EVENT_TYPE = SOURCE_TYPE_NAME = 'redis_sentinel'


class RedisSentinelCheck(AgentCheck):

def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the check and the per-master cache of last seen master IPs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    # master name -> IP of the master seen on the previous run; an unknown
    # master maps to the empty string.
    self._masters = defaultdict(str)

def check(self, instance):
    """Collect Sentinel metrics for every master listed in ``instance``.

    Reads ``sentinel_host``, ``sentinel_port`` and ``masters`` from the
    instance configuration, plus the optional ``tags`` list. A failure while
    processing one master is reported through ``self.warning`` and does not
    prevent the remaining masters from being processed.
    """
    redis_conn = redis.StrictRedis(
        host=instance['sentinel_host'], port=instance['sentinel_port'], db=0
    )
    # An empty ``tags:`` key in the YAML is loaded as None; treat it as
    # "no custom tags" instead of failing on ``list + None`` below.
    custom_tags = list(instance.get('tags') or [])
    for master_name in instance['masters']:
        base_tags = ['redis_name:%s' % master_name] + custom_tags
        try:
            self._process_instance_master(redis_conn, master_name, base_tags)
        except Exception as e:
            # Deliberately broad: one unreachable or misconfigured master
            # must not stop collection for the other masters.
            self.warning("Error collecting metrics for master %s: %s" % (master_name, e))

def _process_instance_master(self, redis_conn, master_name, base_tags):
    """Collect master, slave and sentinel stats for one monitored master.

    The master is processed first because it produces the ``master_tags``
    that the slave and sentinel collectors attach to their metrics.
    """
    master_tags = self._process_master_stats(redis_conn, master_name, base_tags)
    collectors = (self._process_slaves_stats, self._process_sentinels_stats)
    for collect in collectors:
        collect(redis_conn, master_name, base_tags, master_tags)

def _process_sentinels_stats(self, redis_conn, master_name, base_tags, master_tags):
    """Report metrics about the sentinels monitoring ``master_name``.

    Submits ``redis.sentinel.ok_sentinels`` tagged with ``master_tags`` and,
    for every sentinel that is not flagged ``is_odown`` or ``is_sdown``,
    per-sentinel gauges tagged with ``sentinel_ip:<ip>`` plus ``base_tags``.

    Sample ``sentinel_sentinels`` reply:

    [{
        'down-after-milliseconds': 5000,
        'flags': 's_down,sentinel',
        'ip': '10.1.2.3',
        'is_disconnected': False,
        'is_master': False,
        'is_master_down': False,
        'is_odown': False,
        'is_sdown': True,
        'is_sentinel': True,
        'is_slave': False,
        'last-hello-message': 12345678,
        'last-ok-ping-reply': 12345678,
        'last-ping-reply': 12345679,
        'last-ping-sent': 12345678,
        'name': '10.1.2.3:26379',
        'pending-commands': 78,
        'port': 26379,
        'runid': '123456789abcdef',
        's-down-time': 12345678,
        'voted-leader': '?',
        'voted-leader-epoch': 0,
    }]
    """
    sentinels_stats = redis_conn.sentinel_sentinels(master_name)
    # sentinel_sentinels returns stats for the other sentinels only,
    # so start at one to account for the current sentinel.
    ok_sentinels = 1
    for stats in sentinels_stats:
        if stats['is_odown'] or stats['is_sdown']:  # sentinel keeps track of old sentinels
            continue
        ok_sentinels += 1
        sentinel_tags = ['sentinel_ip:%s' % stats['ip']] + base_tags
        self.gauge(
            'redis.sentinel.pending_commands',
            stats['pending-commands'], tags=['sentinel'] + sentinel_tags
        )
        self.gauge(
            'redis.sentinel.ping_latency',
            stats['last-ping-reply'] - stats['last-ping-sent'],
            tags=sentinel_tags
        )
        self.gauge(
            'redis.sentinel.last_ok_ping_latency',
            stats['last-ping-reply'] - stats['last-ok-ping-reply'],
            tags=sentinel_tags
        )
    # Submitted once as an absolute count: the metric is declared as a gauge
    # in the integration's metadata.csv, whereas increment() accumulates a
    # counter rather than reporting the current number of sentinels.
    self.gauge('redis.sentinel.ok_sentinels', ok_sentinels, tags=master_tags)

def _process_slaves_stats(self, redis_conn, master_name, base_tags, master_tags):
    """Report metrics and service checks for the slaves of ``master_name``.

    Slaves flagged ``is_odown`` or ``is_sdown`` are skipped. For each
    remaining slave a pending-commands gauge and two service checks are
    submitted, tagged with ``slave_ip:<ip>`` plus ``base_tags``. The number
    of remaining slaves is submitted as ``redis.sentinel.ok_slaves`` tagged
    with ``master_tags``.

    Sample ``sentinel_slaves`` reply:

    [{
        'down-after-milliseconds': 5000,
        'flags': 'slave',
        'info-refresh': 2628,
        'ip': '10.1.2.3',
        'is_disconnected': False,
        'is_master': False,
        'is_master_down': False,
        'is_odown': False,
        'is_sdown': False,
        'is_sentinel': False,
        'is_slave': True,
        'last-ok-ping-reply': 429,
        'last-ping-reply': 429,
        'last-ping-sent': 0,
        'master-host': '10.1.2.3',
        'master-link-down-time': 0,
        'master-link-status': 'ok',
        'master-port': 6379,
        'name': '10.1.2.3:6379',
        'pending-commands': 0,
        'port': 6379,
        'role-reported': 'slave',
        'role-reported-time': 3124725,
        'runid': '123456789abcdef',
        'slave-priority': 100,
        'slave-repl-offset': 12345678,
    }]
    """
    slaves_stats = redis_conn.sentinel_slaves(master_name)
    ok_slaves = 0
    for stats in slaves_stats:
        if stats['is_odown'] or stats['is_sdown']:  # sentinel keeps track of old slaves
            continue
        ok_slaves += 1
        slave_tags = ['slave_ip:%s' % stats['ip']] + base_tags
        self.gauge(
            'redis.sentinel.pending_commands', stats['pending-commands'],
            tags=['slave'] + slave_tags
        )

        # NOTE(review): nitpick from code review - service check names are
        # consistently affirmative statements elsewhere (i.e. slave_connected
        # or slave_connection_status instead of slave_is_disconnected); not
        # sure it'd make sense here though. The name is left unchanged.
        self.service_check(
            'redis.sentinel.slave_is_disconnected',
            AgentCheck.CRITICAL if stats['is_disconnected'] else AgentCheck.OK,
            tags=slave_tags
        )
        self.service_check(
            'redis.sentinel.slave_master_link_down',
            AgentCheck.CRITICAL if stats['master-link-status'] != 'ok' else AgentCheck.OK,
            tags=slave_tags
        )
    # Submitted once as an absolute count (the metric is declared as a gauge
    # in the integration's metadata.csv). Unlike increment(), this also
    # reports an explicit 0 when no slave is healthy.
    self.gauge('redis.sentinel.ok_slaves', ok_slaves, tags=master_tags)

def _process_master_stats(self, redis_conn, master_name, base_tags):
    """Report master metrics and service checks, and detect failovers.

    Returns the tag list used for master-level metrics
    (``master_ip:<ip>`` followed by ``base_tags``).

    A failover is reported (counter plus event) when the master IP differs
    from the one remembered from the previous run; the first run for a
    master only records its IP.

    Sample ``sentinel_master`` reply:

    {
        'config-epoch': 94,
        'down-after-milliseconds': 5000,
        'failover-timeout': 60000,
        'flags': 'master',
        'info-refresh': 1234,
        'ip': '10.1.2.3',
        'is_disconnected': False,
        'is_master': True,
        'is_master_down': False,
        'is_odown': False,
        'is_sdown': False,
        'is_sentinel': False,
        'is_slave': False,
        'last-ok-ping-reply': 49,
        'last-ping-reply': 49,
        'last-ping-sent': 0,
        'name': 'delancie-backend',
        'num-other-sentinels': 4,
        'num-slaves': 3,
        'parallel-syncs': 10,
        'pending-commands': 0,
        'port': 6379,
        'quorum': 2,
        'role-reported': 'master',
        'role-reported-time': 12345678,
        'runid': '123456789abcdef',
    }
    """
    stats = redis_conn.sentinel_master(master_name)
    current_ip = stats['ip']
    master_tags = ['master_ip:%s' % current_ip] + base_tags

    self.gauge(
        'redis.sentinel.pending_commands',
        stats['pending-commands'],
        tags=['master'] + master_tags
    )
    self.gauge('redis.sentinel.known_slaves', stats['num-slaves'], tags=master_tags)
    # The reply only counts the other sentinels; add the one being queried.
    known_sentinels = stats['num-other-sentinels'] + 1
    self.gauge('redis.sentinel.known_sentinels', known_sentinels, tags=master_tags)

    # (service check name, boolean field of the reply that makes it CRITICAL)
    health_flags = (
        ('redis.sentinel.master_is_disconnected', 'is_disconnected'),
        ('redis.sentinel.master_is_down', 'is_master_down'),
    )
    for check_name, flag in health_flags:
        status = AgentCheck.CRITICAL if stats[flag] else AgentCheck.OK
        self.service_check(check_name, status, tags=master_tags)

    previous_ip = self._masters[master_name]
    if previous_ip != current_ip:
        if previous_ip != "":  # avoid check initialization
            self.increment('redis.sentinel.failover', tags=base_tags)
            title = '%s failover from %s to %s' % (master_name, previous_ip, current_ip)
            self.event({
                'timestamp': int(time.time()),
                'event_type': EVENT_TYPE,
                'msg_title': title,
                'alert_type': 'info',
                "source_type_name": SOURCE_TYPE_NAME,
                "event_object": master_name,
                "tags": base_tags
            })
        self._masters[master_name] = current_ip

    return master_tags
9 changes: 9 additions & 0 deletions redis_sentinel/conf.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
init_config:

instances:
- sentinel_host: localhost
sentinel_port: 26379
tags: ['custom:tag']
masters:
- mymaster
- myothermaster
Binary file added redis_sentinel/images/120x60.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added redis_sentinel/images/128x128.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added redis_sentinel/images/200x128.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
15 changes: 15 additions & 0 deletions redis_sentinel/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"maintainer": "[email protected]",
"manifest_version": "0.1.0",
"max_agent_version": "6.0.0",
"min_agent_version": "5.6.3",
"name": "redis_sentinel",
"parameters": {
"creates_events": true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the code suggest we do create events on failovers here.

"metrics_to_check": ["redis.sentinel.known_sentinels"],
"user_configured": false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this refer to yaml configs or configs in the tile/dogweb?

},
"short_description": "Redis Sentinel provides high availability for Redis.",
"support": "contrib",
"version": "0.1.0"
}
9 changes: 9 additions & 0 deletions redis_sentinel/metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
redis.sentinel.known_sentinels,gauge,,instance,,number of sentinels detected,,redis_sentinel,known sentinels
redis.sentinel.known_slaves,gauge,,instance,,number of slaves detected,,redis_sentinel,known slaves
redis.sentinel.last_ok_ping_latency,gauge,,second,,number of seconds since last OK ping,,redis_sentinel,last ok latency
redis.sentinel.ok_sentinels,gauge,,instance,,number of sentinels up and running,,redis_sentinel,ok sentinels
redis.sentinel.ok_slaves,gauge,,instance,,number of slaves up and running,,redis_sentinel,ok slaves
redis.sentinel.pending_commands,gauge,,command,,number of pending sentinel commands,,redis_sentinel,pending commands
redis.sentinel.ping_latency,gauge,,millisecond,,latency of a sentinel ping,,redis_sentinel,ping latency
redis.sentinel.failover,count,,occurrence,,number of failovers detected,,redis_sentinel,failovers
1 change: 1 addition & 0 deletions redis_sentinel/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
redis
Empty file added redis_sentinel/tests.py
Empty file.