Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mapreduce] Switch most metrics to counters #2487

Merged
merged 1 commit into from
May 18, 2016
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
[mapreduce] Switch most metrics to counters
With the following exceptions:
- `mapreduce.job.elapsed_time`
- `mapreduce.job.map.task.elapsed_time` (made non-default in config)
- `mapreduce.job.reduce.task.elapsed_time` (made non-default in config)

Also, removed `mapreduce.job.map.task.progress` and
`mapreduce.job.reduce.task.progress`.
olivielpeau committed May 10, 2016
commit 38b2667ae487356fa6dac8272ac2e671a28c97ff
57 changes: 31 additions & 26 deletions checks.d/mapreduce.py
Original file line number Diff line number Diff line change
@@ -34,12 +34,10 @@
MapReduce Map Task Metrics
--------------------------
mapreduce.job.map.task.progress The distribution of all map task progresses
mapreduce.job.map.task.elapsed_time The distribution of all map tasks elapsed time

MapReduce Reduce Task Metrics
--------------------------
mapreduce.job.reduce.task.progress The distribution of all reduce task progresses
mapreduce.job.reduce.task.elapsed_time The distribution of all reduce tasks elapsed time
'''

# stdlib
@@ -54,6 +52,8 @@

# Project
from checks import AgentCheck
from config import _is_affirmative


# Default Settings
DEFAULT_CUSTER_NAME = 'default_cluster'
@@ -72,46 +72,46 @@

# Metric types
HISTOGRAM = 'histogram'
INCREMENT = 'increment'

# Metrics to collect
MAPREDUCE_JOB_METRICS = {
'elapsedTime': ('mapreduce.job.elapsed_time', HISTOGRAM),
'mapsTotal': ('mapreduce.job.maps_total', HISTOGRAM),
'mapsCompleted': ('mapreduce.job.maps_completed', HISTOGRAM),
'reducesTotal': ('mapreduce.job.reduces_total', HISTOGRAM),
'reducesCompleted': ('mapreduce.job.reduces_completed', HISTOGRAM),
'mapsPending': ('mapreduce.job.maps_pending', HISTOGRAM),
'mapsRunning': ('mapreduce.job.maps_running', HISTOGRAM),
'reducesPending': ('mapreduce.job.reduces_pending', HISTOGRAM),
'reducesRunning': ('mapreduce.job.reduces_running', HISTOGRAM),
'newReduceAttempts': ('mapreduce.job.new_reduce_attempts', HISTOGRAM),
'runningReduceAttempts': ('mapreduce.job.running_reduce_attempts', HISTOGRAM),
'failedReduceAttempts': ('mapreduce.job.failed_reduce_attempts', HISTOGRAM),
'killedReduceAttempts': ('mapreduce.job.killed_reduce_attempts', HISTOGRAM),
'successfulReduceAttempts': ('mapreduce.job.successful_reduce_attempts', HISTOGRAM),
'newMapAttempts': ('mapreduce.job.new_map_attempts', HISTOGRAM),
'runningMapAttempts': ('mapreduce.job.running_map_attempts', HISTOGRAM),
'failedMapAttempts': ('mapreduce.job.failed_map_attempts', HISTOGRAM),
'killedMapAttempts': ('mapreduce.job.killed_map_attempts', HISTOGRAM),
'successfulMapAttempts': ('mapreduce.job.successful_map_attempts', HISTOGRAM),
'mapsTotal': ('mapreduce.job.maps_total', INCREMENT),
'mapsCompleted': ('mapreduce.job.maps_completed', INCREMENT),
'reducesTotal': ('mapreduce.job.reduces_total', INCREMENT),
'reducesCompleted': ('mapreduce.job.reduces_completed', INCREMENT),
'mapsPending': ('mapreduce.job.maps_pending', INCREMENT),
'mapsRunning': ('mapreduce.job.maps_running', INCREMENT),
'reducesPending': ('mapreduce.job.reduces_pending', INCREMENT),
'reducesRunning': ('mapreduce.job.reduces_running', INCREMENT),
'newReduceAttempts': ('mapreduce.job.new_reduce_attempts', INCREMENT),
'runningReduceAttempts': ('mapreduce.job.running_reduce_attempts', INCREMENT),
'failedReduceAttempts': ('mapreduce.job.failed_reduce_attempts', INCREMENT),
'killedReduceAttempts': ('mapreduce.job.killed_reduce_attempts', INCREMENT),
'successfulReduceAttempts': ('mapreduce.job.successful_reduce_attempts', INCREMENT),
'newMapAttempts': ('mapreduce.job.new_map_attempts', INCREMENT),
'runningMapAttempts': ('mapreduce.job.running_map_attempts', INCREMENT),
'failedMapAttempts': ('mapreduce.job.failed_map_attempts', INCREMENT),
'killedMapAttempts': ('mapreduce.job.killed_map_attempts', INCREMENT),
'successfulMapAttempts': ('mapreduce.job.successful_map_attempts', INCREMENT),
}

MAPREDUCE_JOB_COUNTER_METRICS = {
'reduceCounterValue': ('mapreduce.job.counter.reduce_counter_value', HISTOGRAM),
'mapCounterValue': ('mapreduce.job.counter.map_counter_value', HISTOGRAM),
'totalCounterValue': ('mapreduce.job.counter.total_counter_value', HISTOGRAM),
'reduceCounterValue': ('mapreduce.job.counter.reduce_counter_value', INCREMENT),
'mapCounterValue': ('mapreduce.job.counter.map_counter_value', INCREMENT),
'totalCounterValue': ('mapreduce.job.counter.total_counter_value', INCREMENT),
}

MAPREDUCE_MAP_TASK_METRICS = {
'progress': ('mapreduce.job.map.task.progress', HISTOGRAM),
'elapsedTime': ('mapreduce.job.map.task.elapsed_time', HISTOGRAM)
}

MAPREDUCE_REDUCE_TASK_METRICS = {
'progress': ('mapreduce.job.reduce.task.progress', HISTOGRAM),
'elapsedTime': ('mapreduce.job.reduce.task.elapsed_time', HISTOGRAM)
}


class MapReduceCheck(AgentCheck):

def __init__(self, name, init_config, agentConfig, instances=None):
@@ -129,6 +129,8 @@ def check(self, instance):
if rm_address is None:
raise Exception('The ResourceManager URL must be specified in the instance configuration')

collect_task_metrics = _is_affirmative(instance.get('collect_task_metrics', False))

# Get additional tags from the conf file
tags = instance.get('tags', [])
if tags is None:
@@ -160,7 +162,8 @@ def check(self, instance):
self._mapreduce_job_counters_metrics(running_jobs, tags)

# Get task metrics
self._mapreduce_task_metrics(running_jobs, tags)
if collect_task_metrics:
self._mapreduce_task_metrics(running_jobs, tags)

# Report success after gathering all metrics from Application Master
if running_jobs:
@@ -439,6 +442,8 @@ def _set_metric(self, metric_name, metric_type, value, tags=None, device_name=No
'''
if metric_type == HISTOGRAM:
self.histogram(metric_name, value, tags=tags, device_name=device_name)
elif metric_type == INCREMENT:
self.increment(metric_name, value, tags=tags, device_name=device_name)
else:
self.log.error('Metric type "%s" unknown' % (metric_type))

6 changes: 5 additions & 1 deletion conf.d/mapreduce.yaml.example
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@ instances:
#
# The MapReduce check retrieves metrics from YARN's ResourceManager. This
# check must be run from the Master Node and the ResourceManager URI must
# be specified below. The ResourceManager URI is composed of the
# be specified below. The ResourceManager URI is composed of the
# ResourceManager's hostname and port.
#
# The ResourceManager hostname can be found in the yarn-site.xml conf file
@@ -16,6 +16,10 @@ instances:
# A required friendly name for the cluster.
# cluster_name: MyMapReduceCluster

# Set to true to collect histograms on the elapsed time of
# map and reduce tasks (default: false)
# collect_task_metrics: false

# Optional tags to be applied to every emitted metric.
# tags:
# - key:value
59 changes: 29 additions & 30 deletions tests/checks/mock/test_mapreduce.py
Original file line number Diff line number Diff line change
@@ -84,7 +84,8 @@ class MapReduceCheck(AgentCheckTest):

MR_CONFIG = {
'resourcemanager_uri': 'http://localhost:8088',
'cluster_name': CLUSTER_NAME
'cluster_name': CLUSTER_NAME,
'collect_task_metrics': 'true'
}

INIT_CONFIG = {
@@ -119,24 +120,24 @@ class MapReduceCheck(AgentCheckTest):

MAPREDUCE_JOB_METRIC_VALUES = {
'mapreduce.job.elapsed_time.max': 99221829,
'mapreduce.job.maps_total.max': 1,
'mapreduce.job.maps_completed.max': 0,
'mapreduce.job.reduces_total.max': 1,
'mapreduce.job.reduces_completed.max': 0,
'mapreduce.job.maps_pending.max': 0,
'mapreduce.job.maps_running.max': 1,
'mapreduce.job.reduces_pending.max': 1,
'mapreduce.job.reduces_running.max': 0,
'mapreduce.job.new_reduce_attempts.max': 1,
'mapreduce.job.running_reduce_attempts.max': 0,
'mapreduce.job.failed_reduce_attempts.max': 0,
'mapreduce.job.killed_reduce_attempts.max': 0,
'mapreduce.job.successful_reduce_attempts.max': 0,
'mapreduce.job.new_map_attempts.max': 0,
'mapreduce.job.running_map_attempts.max': 1,
'mapreduce.job.failed_map_attempts.max': 1,
'mapreduce.job.killed_map_attempts.max': 0,
'mapreduce.job.successful_map_attempts.max': 0,
'mapreduce.job.maps_total': 1,
'mapreduce.job.maps_completed': 0,
'mapreduce.job.reduces_total': 1,
'mapreduce.job.reduces_completed': 0,
'mapreduce.job.maps_pending': 0,
'mapreduce.job.maps_running': 1,
'mapreduce.job.reduces_pending': 1,
'mapreduce.job.reduces_running': 0,
'mapreduce.job.new_reduce_attempts': 1,
'mapreduce.job.running_reduce_attempts': 0,
'mapreduce.job.failed_reduce_attempts': 0,
'mapreduce.job.killed_reduce_attempts': 0,
'mapreduce.job.successful_reduce_attempts': 0,
'mapreduce.job.new_map_attempts': 0,
'mapreduce.job.running_map_attempts': 1,
'mapreduce.job.failed_map_attempts': 1,
'mapreduce.job.killed_map_attempts': 0,
'mapreduce.job.successful_map_attempts': 0,
}

MAPREDUCE_JOB_METRIC_TAGS = [
@@ -147,7 +148,6 @@ class MapReduceCheck(AgentCheckTest):
]

MAPREDUCE_MAP_TASK_METRIC_VALUES = {
'mapreduce.job.map.task.progress.max': 49.11076,
'mapreduce.job.map.task.elapsed_time.max': 99869037
}

@@ -160,7 +160,6 @@ class MapReduceCheck(AgentCheckTest):
]

MAPREDUCE_REDUCE_TASK_METRIC_VALUES = {
'mapreduce.job.reduce.task.progress.max': 32.42940,
'mapreduce.job.reduce.task.elapsed_time.max': 123456
}

@@ -173,15 +172,15 @@ class MapReduceCheck(AgentCheckTest):
]

MAPREDUCE_JOB_COUNTER_METRIC_VALUES = {
'mapreduce.job.counter.total_counter_value.max': {'value': 0, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.map_counter_value.max': {'value': 1, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.reduce_counter_value.max': {'value': 2, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.total_counter_value.max': {'value': 3, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.map_counter_value.max': {'value': 4, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.reduce_counter_value.max': {'value': 5, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.total_counter_value.max': {'value': 9, 'tags': ['counter_name:map_output_records']},
'mapreduce.job.counter.map_counter_value.max': {'value': 10, 'tags': ['counter_name:map_output_records']},
'mapreduce.job.counter.reduce_counter_value.max': {'value': 11, 'tags': ['counter_name:map_output_records']},
'mapreduce.job.counter.total_counter_value': {'value': 0, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.map_counter_value': {'value': 1, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.reduce_counter_value': {'value': 2, 'tags': ['counter_name:file_bytes_read']},
'mapreduce.job.counter.total_counter_value': {'value': 3, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.map_counter_value': {'value': 4, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.reduce_counter_value': {'value': 5, 'tags': ['counter_name:file_bytes_written']},
'mapreduce.job.counter.total_counter_value': {'value': 9, 'tags': ['counter_name:map_output_records']},
'mapreduce.job.counter.map_counter_value': {'value': 10, 'tags': ['counter_name:map_output_records']},
'mapreduce.job.counter.reduce_counter_value': {'value': 11, 'tags': ['counter_name:map_output_records']},
}

MAPREDUCE_JOB_COUNTER_METRIC_TAGS = [