[docker daemon] Add support for labels and refactor tagging. Fix #1742
Remi Hakim committed Sep 11, 2015
1 parent 78c5a04 commit ac36186
Showing 3 changed files with 188 additions and 74 deletions.
191 changes: 118 additions & 73 deletions checks.d/docker_daemon.py
@@ -70,6 +70,12 @@
"image_tag",
]

DEFAULT_IMAGE_TAGS = [
'image_name',
'image_tag'
]


def image_tag_extractor(c, key):
if "Image" in c:
split = c["Image"].split(":", 1)
@@ -96,28 +102,32 @@ def image_tag_extractor(c, key):
"container_name": lambda c: [c['Names'][0].lstrip("/")] if c["Names"] else [c['Id'][:11]],
}

CONTAINER = "container"
PERFORMANCE = "performance"
FILTERED = "filtered"
IMAGE = "image"


def get_mountpoints(docker_root):
mountpoints = {}
for metric in CGROUP_METRICS:
mountpoints[metric["cgroup"]] = find_cgroup(metric["cgroup"], docker_root)
return mountpoints

def get_filters(instance):
def get_filters(include, exclude):
# The reasoning is to check exclude first, so we can skip if there is no exclude
if not instance.get("exclude"):
instance["filtering_enabled"] = False
if not exclude:
return

filtered_tag_names = []
exclude_patterns = []
include_patterns = []

# Compile regex
for rule in instance.get("exclude", []):
for rule in exclude:
exclude_patterns.append(re.compile(rule))
filtered_tag_names.append(rule.split(':')[0])
for rule in instance.get("include", []):
for rule in include:
include_patterns.append(re.compile(rule))
filtered_tag_names.append(rule.split(':')[0])

@@ -149,60 +159,80 @@ def __init__(self, name, init_config, agentConfig, instances=None):
# At first run we'll just collect the events from the latest 60 secs
self._last_event_collection_ts = int(time.time()) - 60

# Set tagging options
self.custom_tags = instance.get("tags", [])
self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
self.tag_names = {
CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
PERFORMANCE: instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)

}

# Set filtering settings
if not instance.get("exclude"):
self._filtering_enabled = False
else:
self._filtering_enabled = True
self._exclude_patterns, self._include_patterns, self._filtered_tag_names = get_filters(instance)
include = instance.get("include", [])
exclude = instance.get("exclude", [])
self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
self.tag_names[FILTERED] = _filtered_tag_names


# Other options
self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', True))
self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
self.collect_events = _is_affirmative(instance.get('collect_events', True))
self.collect_image_size = _is_affirmative(instance.get('collect_image_size', True))
self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()


def check(self, instance):
"""Run the Docker check for one instance."""

# Report image metrics
if _is_affirmative(instance.get('collect_images_stats', True)):
self._count_and_weight_images(instance)
if self.collect_image_stats:
self._count_and_weight_images()

if self.collect_ecs_tags:
self.refresh_ecs_tags()

# Get the list of containers and the index of their names
containers_by_id = self._get_and_count_containers(instance)
containers_by_id = self._get_and_count_containers()
containers_by_id = self._crawl_container_pids(containers_by_id)

# Report performance container metrics (cpu, mem, net, io)
self._report_performance_metrics(instance, containers_by_id)
self._report_performance_metrics(containers_by_id)

if _is_affirmative(instance.get('collect_container_size', False)):
self._report_container_size(instance, containers_by_id)
if self.collect_container_size:
self._report_container_size(containers_by_id)

# Send events from Docker API
if _is_affirmative(instance.get('collect_events', True)):
self._process_events(instance, containers_by_id)
if self.collect_events:
self._process_events(containers_by_id)



# Containers

def _count_and_weight_images(self, instance):
def _count_and_weight_images(self):
try:
extra_tags = instance.get('tags', [])
tags = self._get_tags()
active_images = self.client.images(all=False)
active_images_len = len(active_images)
all_images_len = len(self.client.images(quiet=True, all=True))
self.gauge("docker.images.available", active_images_len, tags=extra_tags)
self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=extra_tags)
self.gauge("docker.images.available", active_images_len, tags=tags)
self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

if _is_affirmative(instance.get('collect_image_size', True)):
self._report_image_size(instance, active_images)
if self.collect_image_size:
self._report_image_size(active_images)

except Exception, e:
# It's not an important metric, keep going if it fails
self.warning("Failed to count Docker images. Exception: {0}".format(e))

def _get_and_count_containers(self, instance):
def _get_and_count_containers(self):
"""List all the containers from the API, filter and count them."""

# Querying the size of containers is slow, we don't do it at each run
must_query_size = _is_affirmative(instance.get('collect_container_size', False)) and self._latest_size_query == 0
must_query_size = self.collect_container_size and self._latest_size_query == 0
self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

running_containers_count = Counter()
@@ -220,23 +250,14 @@ def _get_and_count_containers(self, instance):
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

# Filter containers according to the exclude/include rules
self._filter_containers(instance, containers)
self._filter_containers(containers)

containers_by_id = {}

# Dict of container ids and a list of their Amazon ECS task tags
ecs_tags = None
if Platform.is_ecs_instance() and instance.get('ecs_tags', True):
ecs_tags = self._get_ecs_tags()

for container in containers:
custom_tags = instance.get('tags', [])
if ecs_tags:
custom_tags += ecs_tags.get(container['Id'], [])
container_name = container['Names'][0].strip('/')

tag_names = instance.get("container_tags", DEFAULT_CONTAINER_TAGS)
container_status_tags = self._get_tags(container, tag_names) + custom_tags
container_status_tags = self._get_tags(container, CONTAINER)

all_containers_count[tuple(sorted(container_status_tags))] += 1
if self._is_container_running(container):
@@ -266,14 +287,38 @@ def _is_container_running(self, container):
"""
return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

def _get_tags(self, entity, tag_names):
def _get_tags(self, entity=None, tag_type=None):
"""Generate the tags for a given entity (container or image) according to a list of tag names."""
tags = []
for tag_name in tag_names:
tag_value = self._extract_tag_value(entity, tag_name)
if tag_value is not None:
for t in tag_value:
tags.append('%s:%s' % (tag_name, t.strip()))
# Start with custom tags
tags = list(self.custom_tags)
if entity is not None:

# Get labels as tags
labels = entity.get("Labels")
if labels is not None:
for k in self.collect_labels_as_tags:
if k in labels:
v = labels[k]
if not v:
tags.append(k)
else:
tags.append("%s:%s" % (k,v))

# Get entity specific tags
if tag_type is not None:
tag_names = self.tag_names[tag_type]
for tag_name in tag_names:
tag_value = self._extract_tag_value(entity, tag_name)
if tag_value is not None:
for t in tag_value:
tags.append('%s:%s' % (tag_name, t.strip()))

# Add ECS tags
if self.collect_ecs_tags:
entity_id = entity.get("Id")
if entity_id in self.ecs_tags:
ecs_tags = self.ecs_tags[entity_id]
tags.extend(ecs_tags)

return tags

@@ -285,6 +330,7 @@ def _extract_tag_value(self, entity, tag_name):
if tag_name not in TAG_EXTRACTORS:
self.warning("{0} isn't a supported tag".format(tag_name))
return

# Check for already extracted tags
if "_tag_values" not in entity:
entity["_tag_values"] = {}
@@ -294,33 +340,34 @@ def _extract_tag_value(self, entity, tag_name):

return entity["_tag_values"][tag_name]

def _get_ecs_tags(self):
def refresh_ecs_tags(self):
ecs_config = self.client.inspect_container('ecs-agent')
net_conf = ecs_config['NetworkSettings'].get('Ports', {})
net_conf = net_conf.get(net_conf.keys()[0], [])
container_tags = {}
ecs_tags = {}
if net_conf:
net_conf = net_conf[0] if isinstance(net_conf, list) else net_conf
ip, port = net_conf.get('HostIp'), net_conf.get('HostPort')
tasks = requests.get('http://%s:%s' % (ip, port)).json()
for task in tasks.get('Tasks', []):
for container in task.get('Containers', []):
tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
container_tags[container['DockerId']] = tags
return container_tags
ecs_tags[container['DockerId']] = tags

self.ecs_tags = ecs_tags

def _filter_containers(self, instance, containers):
def _filter_containers(self, containers):
if not self._filtering_enabled:
return

for container in containers:
container_tags = self._get_tags(container, self._filtered_tag_names)
container['_is_filtered'] = self._are_tags_filtered(instance, container_tags)
container_tags = self._get_tags(container, FILTERED)
container['_is_filtered'] = self._are_tags_filtered(container_tags)

if container['_is_filtered']:
self.log.debug("Container {0} is filtered".format(container["Names"][0]))

def _are_tags_filtered(self, instance, tags):
def _are_tags_filtered(self, tags):
if self._tags_match_patterns(tags, self._exclude_patterns):
if self._tags_match_patterns(tags, self._include_patterns):
return False
@@ -341,41 +388,40 @@ def _is_container_excluded(self, container):
"""
return container.get('_is_filtered', False)

def _report_container_size(self, instance, containers_by_id):
def _report_container_size(self, containers_by_id):
container_list_with_size = None
for container in containers_by_id.itervalues():
if self._is_container_excluded(container):
continue

tag_names = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)
container_tags = self._get_tags(container, tag_names) + instance.get('tags', [])
tags = self._get_tags(container, PERFORMANCE)

if "SizeRw" in container:
self.gauge('docker.container.size_rw', container['SizeRw'], tags=container_tags)
self.gauge('docker.container.size_rw', container['SizeRw'],
tags=tags)
if "SizeRootFs" in container:
self.gauge('docker.container.size_rootfs', container['SizeRootFs'], tags=container_tags)
self.gauge(
'docker.container.size_rootfs', container['SizeRootFs'],
tags=tags)

def _report_image_size(self, instance, images):
def _report_image_size(self, images):
for image in images:
tag_names = instance.get('image_tags', ['image_name', 'image_tag'])
image_tags = self._get_tags(image, tag_names) + instance.get('tags', [])
tags = self._get_tags(image, IMAGE)
if 'VirtualSize' in image:
self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=image_tags)
self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
if 'Size' in image:
self.gauge('docker.image.size', image['Size'], tags=image_tags)
self.gauge('docker.image.size', image['Size'], tags=tags)

# Performance metrics

def _report_performance_metrics(self, instance, containers_by_id):
def _report_performance_metrics(self, containers_by_id):
for container in containers_by_id.itervalues():
if self._is_container_excluded(container) or not self._is_container_running(container):
continue

tag_names = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)
container_tags = self._get_tags(container, tag_names) + instance.get('tags', [])

self._report_cgroup_metrics(container, container_tags)
self._report_net_metrics(container, container_tags)
tags = self._get_tags(container, PERFORMANCE)
self._report_cgroup_metrics(container, tags)
self._report_net_metrics(container, tags)

def _report_cgroup_metrics(self, container, tags):
try:
@@ -412,8 +458,6 @@ def _report_cgroup_metrics(self, container, tags):
def _report_net_metrics(self, container, tags):
"""Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
proc_net_file = os.path.join(container['_proc_root'], 'net/dev')

fp = None
try:
with open(proc_net_file, 'r') as fp:
lines = fp.readlines()
@@ -433,9 +477,9 @@ def _report_net_metrics(self, container, tags):
# It is possible that the container got stopped between the API call and now
self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

def _process_events(self, instance, containers_by_id):
def _process_events(self, containers_by_id):
try:
api_events = self._get_events(instance)
api_events = self._get_events()
aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
events = self._format_events(aggregated_events, containers_by_id)
except (socket.timeout, urllib2.URLError):
Expand All @@ -450,7 +494,7 @@ def _process_events(self, instance, containers_by_id):
self.log.debug("Creating event: %s" % ev['msg_title'])
self.event(ev)

def _get_events(self, instance):
def _get_events(self):
"""Get the list of events."""
now = int(time.time())
events = []
@@ -488,6 +532,7 @@ def _format_events(self, aggregated_events, containers_by_id):
container_name = event['id'][:11]
if event['id'] in containers_by_id:
container_name = containers_by_id[event['id']]['Names'][0].strip('/')

status_change.append([container_name, event['status']])

status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
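
For reference (not part of the diff above), a minimal standalone sketch of the exclude/include filtering that get_filters and _are_tags_filtered now implement without the instance argument. It assumes _tags_match_patterns returns True when any tag matches any compiled pattern (its body is collapsed in this view), and the rule strings are illustrative only.

import re

def get_filters(include, exclude):
    # Check exclude first: if nothing is excluded, no filtering is needed at all.
    if not exclude:
        return None, None, None

    exclude_patterns = [re.compile(rule) for rule in exclude]
    include_patterns = [re.compile(rule) for rule in include]
    # Only the tag names referenced by a rule need to be extracted for filtering.
    filtered_tag_names = [rule.split(':')[0] for rule in exclude + include]
    return exclude_patterns, include_patterns, filtered_tag_names

def tags_match_patterns(tags, patterns):
    # Assumed behaviour of _tags_match_patterns: any pattern matching any tag.
    return any(p.match(t) for p in patterns for t in tags)

def are_tags_filtered(tags, exclude_patterns, include_patterns):
    # A container is dropped when it matches an exclude rule and no include rule.
    return (tags_match_patterns(tags, exclude_patterns)
            and not tags_match_patterns(tags, include_patterns))

# Illustrative rules: exclude every ubuntu container except the one named "important".
excl, incl, _ = get_filters(include=["container_name:important"],
                            exclude=["image_name:ubuntu"])
print(are_tags_filtered(["image_name:ubuntu", "container_name:web"], excl, incl))        # True
print(are_tags_filtered(["image_name:ubuntu", "container_name:important"], excl, incl))  # False
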
5 changes: 5 additions & 0 deletions conf.d/docker_daemon.yaml.example
@@ -124,3 +124,8 @@ instances:
# Available: ["image_name", "image_tag", "docker_image", "container_command"]
#
# container_tags: ["image_name", "image_tag", "docker_image"]

# List of container label names that should be collected and sent as tags.
# Default to None
# Example:
# collect_labels_as_tags: ["com.docker.compose.service", "com.docker.compose.project"]
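
For reference (not part of the diff above), a short sketch of how the new collect_labels_as_tags option maps container labels to tags, mirroring the logic added to _get_tags: a label with an empty value yields a bare key tag, otherwise key:value. The label values below are made up, and com.example.debug is a hypothetical label name.

def labels_to_tags(labels, collect_labels_as_tags):
    """Turn a container's Labels dict into tags, as the refactored _get_tags does."""
    tags = []
    for k in collect_labels_as_tags:
        if k in labels:
            v = labels[k]
            if not v:
                # A label with an empty value becomes a bare key tag.
                tags.append(k)
            else:
                tags.append("%s:%s" % (k, v))
    return tags

# Made-up labels such as docker-compose would set (com.example.debug is hypothetical).
labels = {
    "com.docker.compose.service": "web",
    "com.docker.compose.project": "myapp",
    "com.example.debug": "",
}
print(labels_to_tags(labels, ["com.docker.compose.service",
                              "com.docker.compose.project",
                              "com.example.debug"]))
# ['com.docker.compose.service:web', 'com.docker.compose.project:myapp', 'com.example.debug']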