Skip to content

Commit

Permalink
[CELEBORN-1817] add committed file size metrics
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR adds a metric that tracks the size of partition files committed by workers.

### Why are the changes needed?

The reason for adding this metric is that we observed that, likely due to the delayed processing of split messages, some jobs write 40-50 GB files even though the split threshold is 10 GB (we use soft split).

We want this metric so that we can monitor the severity of the issue.

### Does this PR introduce _any_ user-facing change?

Yes, one more metric.

### How was this patch tested?

(ignore the dashboard title, it's a dummy one)

![image](https://github.com/user-attachments/assets/d88c15e6-d740-4def-94d5-03666bbb38ca)

Closes #3047 from CodingCat/committed_file_size.

Authored-by: Nan <[email protected]>
Signed-off-by: mingji <[email protected]>
  • Loading branch information
Nan authored and FMX committed Jan 7, 2025
1 parent 6853b23 commit ca60613
Show file tree
Hide file tree
Showing 4 changed files with 305 additions and 6 deletions.
306 changes: 300 additions & 6 deletions assets/grafana/celeborn-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -2951,11 +2951,305 @@
],
"title": "metrics_IsDecommissioningWorker_Value",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "size of partition files in bytes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 110
},
"id": 235,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_Mean{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_Mean",
"type": "timeseries"
}
],
"title": "Worker",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "p99 size of partition files in size",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 70
},
"id": 236,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_P99{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_P99",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "max size of partition files in size",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 110
},
"id": 237,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_max{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_MAX",
"type": "timeseries"
},
{
"collapsed": true,
"gridPos": {
Expand Down Expand Up @@ -5091,7 +5385,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5191,7 +5485,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5291,7 +5585,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5390,7 +5684,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5490,7 +5784,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5590,7 +5884,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down
1 change: 1 addition & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ These metrics are exposed by Celeborn worker.
| PausePushDataAndReplicateTime | The time for a worker to stop receiving pushData from clients and other workers because of back pressure. |
| PausePushData | The count for a worker to stop receiving pushData from clients because of back pressure. |
| PausePushDataAndReplicate | The count for a worker to stop receiving pushData from clients and other workers because of back pressure. |
| PartitionFileSizeBytes | The size of partition files committed by the current worker. |
| TakeBufferTime | The time for a worker to take out a buffer from a disk flusher. |
| FlushDataTime | The time for a worker to write a buffer which is 256KB by default to storage. |
| CommitFilesTime | The time for a worker to flush buffers and close files related to specified shuffle. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,10 @@ protected synchronized long close(
}
}
if (diskFileInfo != null) {
source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), diskFileInfo.getFileLength());
return diskFileInfo.getFileLength();
} else {
source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), memoryFileInfo.getFileLength());
return memoryFileInfo.getFileLength();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, Role.WORKER)
addTimer(CLEAN_EXPIRED_SHUFFLE_KEYS_TIME)

addHistogram(FETCH_CHUNK_TRANSFER_SIZE)
addHistogram(PARTITION_FILE_SIZE)

def getCounterCount(metricsName: String): Long = {
val metricNameWithLabel = metricNameWithCustomizedLabels(metricsName, Map.empty)
Expand Down Expand Up @@ -215,6 +216,7 @@ object WorkerSource {
val DEVICE_OS_TOTAL_CAPACITY = "DeviceOSTotalBytes"
val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"
val PARTITION_FILE_SIZE = "PartitionFileSizeBytes"

// congestion control
val POTENTIAL_CONSUME_SPEED = "PotentialConsumeSpeed"
Expand Down

0 comments on commit ca60613

Please sign in to comment.