From bba1a72d33092019dd1e3c66449d84ff79f659c1 Mon Sep 17 00:00:00 2001 From: Fotis Nikolaidis Date: Thu, 15 Jun 2023 18:03:34 +0300 Subject: [PATCH] Improve reporting and fix bug on constant distribution Signed-off-by: Fotis Nikolaidis --- CHANGELOG.md | 4 + Makefile | 2 +- api/v1alpha1/type_expressions.go | 4 +- .../cockroachdb/examples/11.network.yml | 67 ---- .../cockroachdb/templates/cluster.yml | 41 +-- .../fedbed/templates/server.yaml | 27 +- charts/system/dashboards/selectable.json | 103 +++---- charts/system/dashboards/singleton.json | 276 +++++++++++------ charts/system/dashboards/summary.json | 289 ++++-------------- .../templates/telemetry/cadvisor/cadvisor.yml | 5 + .../templates/telemetry/grafana/grafana.yml | 9 +- .../commands/common/kubectl-wrapper.go | 131 +++++--- cmd/kubectl-frisbee/commands/tests/inspect.go | 2 +- cmd/kubectl-frisbee/commands/tests/report.go | 2 +- cmd/kubectl-frisbee/commands/tests/submit.go | 17 +- .../commands/tests/validate.go | 4 +- cmd/kubectl-frisbee/env/logo.go | 2 +- controllers/common/defaults.go | 4 +- controllers/scenario/controller.go | 10 +- examples/databases/elasticity/install.sh | 10 +- .../databases/network-partition/install.sh | 9 +- .../databases/network-partition/manifest.yml | 2 +- examples/databases/normal-load/install.sh | 10 +- examples/databases/sstable-bitrot/install.sh | 9 +- .../databases/sstable-bitrot/manifest.yml | 2 +- .../federated_learning/5.crashing-nodes.yml | 53 ---- .../crash-on-epoch/install.sh | 9 +- .../crash-on-epoch/manifest.yml | 12 +- .../federated_learning/ml-backend/install.sh | 9 +- .../ml-backend/manifest.yml | 6 +- .../node-placement/install.sh | 24 ++ .../manifest.yml} | 10 +- .../parallel-workflows/install.sh | 9 +- .../parallel-workflows/manifest.yml | 73 +++-- .../resource-distribution/install.sh | 24 ++ .../manifest.yml} | 6 +- examples/tutorial/19.sla-assertions.yml | 2 +- go.mod | 5 +- go.sum | 8 + pkg/distributions/constant.go | 4 +- pkg/distributions/sample_generator.go 
| 24 +- pkg/distributions/sample_generator_test.go | 110 ++++++- pkg/expressions/metrics.go | 4 - pkg/grafana/alerts.go | 26 +- pkg/grafana/client.go | 17 +- pkg/grafana/notifications.go | 7 +- 46 files changed, 740 insertions(+), 743 deletions(-) delete mode 100644 charts/databases/cockroachdb/examples/11.network.yml delete mode 100644 examples/federated_learning/5.crashing-nodes.yml create mode 100755 examples/federated_learning/node-placement/install.sh rename examples/federated_learning/{3.client-placement.yml => node-placement/manifest.yml} (92%) create mode 100755 examples/federated_learning/resource-distribution/install.sh rename examples/federated_learning/{2.resource-distribution.yml => resource-distribution/manifest.yml} (93%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66faf536..5f2d62df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,17 @@ ### Changed defaults / behaviours - Upgrade to Grafana v9.4.7 in order to avoid issues with locking database (https://github.com/grafana/grafana/issues/60703) +- Change Grafana requirements to 2 CPU and 14 Gi memory. - ... ### New Features & Functionality - Upgrade API to Kubernetes v0.27.2. Upgrade all other deps to the latest version. +- Fixed kubectl-frisbee to monitor multiple pods in parallel. - ... ## Bug Fixes +- Added --network host to "make docker-build" to fix the network discovery issues. +- Fix the Constant distribution. It was returning normalized values. - ... ## 1.0.40 \[2023-05-23\] diff --git a/Makefile b/Makefile index 5b13b291..127bcd32 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ run: generate fmt vet certs ## Run a controller from your host. docker-build: ## Build docker image for the Frisbee controller. @echo "===> Build Frisbee Container <===" - docker build -t ${IMG} . + docker build -t ${IMG} . --network host docker-run: docker-build ## Build and Run docker image for the Frisbee controller. 
@echo "===> Run Frisbee Container Locally <===" diff --git a/api/v1alpha1/type_expressions.go b/api/v1alpha1/type_expressions.go index ff09b44a..e2fbdeba 100644 --- a/api/v1alpha1/type_expressions.go +++ b/api/v1alpha1/type_expressions.go @@ -161,8 +161,8 @@ func (expr ExprState) GoValuate(state interface{}) (bool, error) { // +kubebuilder:object:generate=false -// ExprMetricsValidator expressions evaluated with https://regex101.com/r/8JrgyI/1 -var ExprMetricsValidator = regexp.MustCompile(`(?m)^(?P\w+)\(\)\s+of\s+query\((?P\w+)\/(?P\d+)\/(?P.+),\s+(?P\w+),\s+(?P\w+)\)\s+is\s+(?P\w+)\((?P-*\d*[\.,\s]*\d*)\)\s*(for\s+\((?P\w+)\))*\s*(every\((?P\w+)\))*\s*$`) +// ExprMetricsValidator expressions evaluated with https://regex101.com/r/bjPwQK/1 +var ExprMetricsValidator = regexp.MustCompile(`(?m)^(?P\w+)\(\)\s+of\s+query\((?P\w+)\/(?P\d+)\/(?P.+),\s+(?P\w+),\s+(?P\w+)\)\s+is\s+(?P\w+)\((?P-*\d*[\.,\s]*\d*\w*)\)\s*(for\s+\((?P\w+)\))*\s*(every\((?P\w+)\))*\s*$`) type ExprMetrics string diff --git a/charts/databases/cockroachdb/examples/11.network.yml b/charts/databases/cockroachdb/examples/11.network.yml deleted file mode 100644 index e90659ab..00000000 --- a/charts/databases/cockroachdb/examples/11.network.yml +++ /dev/null @@ -1,67 +0,0 @@ ---- -apiVersion: frisbee.dev/v1alpha1 -kind: Scenario -metadata: - name: cockroach-network -spec: - actions: - # Step 0. provision 4 individual servers - - action: Cluster - name: masters - cluster: - templateRef: cockroach.cluster.master - instances: 4 - inputs: - - { join: "masters-1:26257,masters-2:26257,masters-3:26257,masters-4:26257" } - - # Step 1. Create a cockroach cluster from the individual servers - - action: Call - name: boot - depends: { running: [ masters ] } - call: - callable: boot - services: [ masters-1 ] - expect: - - { stdout: "Cluster successfully initialized.*" } - - # Step 2. 
import TPC-C data from the workload node (node 1) - - action: Call - name: import-workload - depends: { success: [ boot ] } - call: - callable: test2-import-tpcc - services: [ masters-1 ] - - # Step 3. wait for 3x replication - # Alternatively, wait for "Under-replicated" metric from Grafana to become 0 - - action: Call - name: wait-for-3x-replication - depends: { success: [ import-workload ] } - call: - callable: test2-wait-for-3x-replication - services: [ masters-1 ] - - # Step 4. run TPC-C workload from the workload node (node 4) - - action: Call - name: run-workload - depends: { success: [ wait-for-3x-replication ] } - call: - callable: test2-run-workload - services: [ masters-3 ] - - # Step5. partition node 1 from the rest of the nodes; node 1 can reach other nodes, but no other nodes can reach node 1 - - action: Chaos - name: partition0 - depends: { success: [ wait-for-3x-replication ], after: "3m" } # give tpcc a head start - chaos: - templateRef: system.chaos.network.partition.partial - inputs: - - { source: masters-1, duration: 10m , direction: "to", dst: "masters-2, masters-3, masters-4" } - - - # Teardown - - action: Delete - name: teardown - depends: { running: [ masters ], success: [ partition0, run-workload ] } - delete: - jobs: [ masters ] \ No newline at end of file diff --git a/charts/databases/cockroachdb/templates/cluster.yml b/charts/databases/cockroachdb/templates/cluster.yml index bef2d4d5..f10a67af 100644 --- a/charts/databases/cockroachdb/templates/cluster.yml +++ b/charts/databases/cockroachdb/templates/cluster.yml @@ -56,15 +56,6 @@ spec: set -eum cockroach init --insecure - import-tpcc: - container: main - command: - - /bin/sh # Run shell - - -c # Read from string - - | # Multi-line str - set -eum - ./cockroach workload fixtures import tpcc --warehouses=200 --fks=false --checks=false - bitrot: container: main command: @@ -99,26 +90,8 @@ spec: dd if=/dev/urandom of=${file}.sst seek=256 count=128 bs=1 conv=notrunc done - run-workload: - 
container: main - command: - - /bin/sh # Run shell - - -c # Read from string - - | # Multi-line str - set -eum - ./cockroach workload run tpcc --warehouses=100 --tolerate-errors - - - test2-import-tpcc: - container: main - command: - - /bin/sh # Run shell - - -c # Read from string - - | # Multi-line str - set -eum - ./cockroach workload fixtures import tpcc --warehouses=1 - test2-wait-for-3x-replication: + wait-for-3x-replication: container: main command: - /bin/sh # Run shell @@ -149,14 +122,4 @@ spec: echo "Sleep" sleep 2 - done - - test2-run-workload: - container: main - command: - - /bin/sh # Run shell - - -c # Read from string - - | # Multi-line str - set -eum - - ./cockroach workload run tpcc --warehouses=1 --wait=false --histograms=/stats.json --duration=30m + done \ No newline at end of file diff --git a/charts/federated-learning/fedbed/templates/server.yaml b/charts/federated-learning/fedbed/templates/server.yaml index 769838d7..dcbea3c7 100644 --- a/charts/federated-learning/fedbed/templates/server.yaml +++ b/charts/federated-learning/fedbed/templates/server.yaml @@ -71,16 +71,15 @@ spec: # # Push round events to grafana # - - name: annotator - image: alpine/curl:latest + image: icsforth/annotator:latest volumeMounts: - name: logs mountPath: /fl-logs/ command: - - /bin/sh # Run shell - - -c # Read from string - - | # Multi-line str + - /bin/bash + - -c + - | set -eux export logfile=/fl-logs/server.log @@ -98,16 +97,24 @@ spec: annotation=$(echo '{"tags":["app"],"text":"'${EVENTCNT}'"}') - curl -s -H "Content-Type: application/json" -X POST -d ${annotation} ${GRAFANA}/api/annotations + curl --connect-timeout 5 \ + --max-time 10 \ + --retry 5 \ + --retry-delay 0 \ + --retry-max-time 40 \ + -H "Content-Type: application/json" \ + -X POST \ + -d ${annotation} \ + ${GRAFANA}/api/annotations echo -e "\n--" done - - - + # + # Block waiting for a specific round + # callables: - wait-for-round: # block waiting for a specific round + wait-for-round: container: main 
command: - /bin/bash diff --git a/charts/system/dashboards/selectable.json b/charts/system/dashboards/selectable.json index 2c98cfeb..d1ef4f31 100644 --- a/charts/system/dashboards/selectable.json +++ b/charts/system/dashboards/selectable.json @@ -99,9 +99,6 @@ "showLine": false, "step": "", "tagKeys": "", - "tags": [ - "run" - ], "tagsField": "", "target": { "limit": 100, @@ -153,7 +150,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 10, + "id": 1, "links": [], "liveNow": false, "panels": [ @@ -245,7 +242,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -320,7 +317,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -396,7 +393,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -471,7 +468,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -584,7 +581,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -674,7 +671,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -748,7 +745,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -818,7 +815,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -905,7 +902,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -992,7 +989,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1081,7 +1078,7 @@ 
"text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1168,7 +1165,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1230,7 +1227,7 @@ "tags": [], "viewMode": "list" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1351,7 +1348,7 @@ }, "showHeader": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "scroll": false, "span": 0, "targets": [ @@ -1435,7 +1432,7 @@ "showTime": true, "showUser": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1576,7 +1573,7 @@ }, { "id": "unit", - "value": "s" + "value": "short" }, { "id": "custom.axisLabel", @@ -1633,7 +1630,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1848,7 +1845,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2013,7 +2010,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2191,7 +2188,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2360,7 +2357,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2575,7 +2572,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2778,7 +2775,7 @@ "showUnfilled": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2869,7 +2866,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2901,7 +2898,6 @@ ], "title": "IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2959,7 +2955,7 @@ 
"sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2991,7 +2987,6 @@ ], "title": "Total IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3068,7 +3063,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3100,7 +3095,6 @@ ], "title": "IO Traffic (Operations)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3158,7 +3152,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3190,7 +3184,6 @@ ], "title": "Total IO Traffic (Operations)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3274,7 +3267,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3307,7 +3300,6 @@ "thresholds": [], "title": "IO Throughput (Filesystem)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3393,7 +3385,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3426,7 +3418,6 @@ "thresholds": [], "title": "IOPS (Filesystem)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3511,7 +3502,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3547,7 +3538,6 @@ "thresholds": [], "title": "Estimated IO Request Size", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3631,7 +3621,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3664,7 +3654,6 @@ "thresholds": [], "title": "Average Time spent in IO", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3771,7 +3760,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ 
-3803,7 +3792,6 @@ ], "title": "Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3861,7 +3849,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3893,7 +3881,6 @@ ], "title": "Total Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3970,7 +3957,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -4002,7 +3989,6 @@ ], "title": "Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -4060,7 +4046,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -4092,7 +4078,6 @@ ], "title": "Total Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -4228,7 +4213,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4404,7 +4389,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4586,7 +4571,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4764,7 +4749,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4920,7 +4905,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -5098,7 +5083,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -5273,7 +5258,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -5420,6 +5405,6 @@ "timezone": "", "title": "Frisbee Console: Selectable", "uid": "selectable", - "version": 2, + "version": 1, "weekStart": "" } \ No newline at end of file diff --git 
a/charts/system/dashboards/singleton.json b/charts/system/dashboards/singleton.json index 97fcf4b8..a5006b6e 100644 --- a/charts/system/dashboards/singleton.json +++ b/charts/system/dashboards/singleton.json @@ -100,7 +100,7 @@ "step": "", "tagKeys": "", "tags": [ - "run" + "create" ], "tagsField": "", "target": { @@ -153,7 +153,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 4, + "id": 3, "links": [], "liveNow": false, "panels": [ @@ -236,7 +236,7 @@ }, "showHeader": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "scroll": false, "span": 0, "targets": [ @@ -319,7 +319,7 @@ "showTime": true, "showUser": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -430,7 +430,7 @@ "tags": [], "viewMode": "list" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -666,7 +666,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -805,7 +805,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -884,7 +884,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -951,7 +952,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1060,7 +1061,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1092,7 +1094,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1171,7 +1173,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1203,7 +1206,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1244,6 +1247,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": 
"", "axisPlacement": "auto", "barAlignment": 0, @@ -1280,7 +1285,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1332,10 +1338,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1404,6 +1411,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1436,7 +1445,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1489,10 +1499,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1524,7 +1535,6 @@ ], "title": "IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -1578,10 +1588,11 @@ "values": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1613,7 +1624,6 @@ ], "title": "Total IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -1628,6 +1638,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1660,7 +1672,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1713,10 +1726,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1748,7 +1762,6 @@ ], "title": "IO Operations", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ 
-1802,10 +1815,11 @@ "values": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1837,7 +1851,6 @@ ], "title": "Total IO Traffic (Operations)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -1855,7 +1868,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1872,6 +1886,8 @@ "id": 137, "options": { "displayMode": "lcd", + "minVizHeight": 10, + "minVizWidth": 0, "orientation": "horizontal", "reduceOptions": { "calcs": [ @@ -1883,7 +1899,7 @@ "showUnfilled": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1914,6 +1930,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1947,7 +1965,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1976,10 +1995,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2000,7 +2020,6 @@ "thresholds": [], "title": "IO Throughput (Write)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2017,6 +2036,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2050,7 +2071,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -2079,10 +2101,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2101,7 +2124,6 @@ "thresholds": [], "title": "IO Throughput (Read)", 
"transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2118,6 +2140,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2151,7 +2175,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -2180,10 +2205,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2204,7 +2230,6 @@ "thresholds": [], "title": "IOPS (Write)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2221,6 +2246,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2254,7 +2281,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -2283,10 +2311,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2305,7 +2334,6 @@ "thresholds": [], "title": "IOPS (Read)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2322,6 +2350,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2355,7 +2385,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2388,10 +2419,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2412,7 +2444,6 @@ "thresholds": [], "title": "Estimated IO Request Size (Write)", "transformations": [], - "transparent": false, "type": 
"timeseries" }, { @@ -2429,6 +2460,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2462,7 +2495,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2495,10 +2529,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2520,7 +2555,6 @@ "thresholds": [], "title": "Estimated IO Request Size (Read)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2537,6 +2571,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2564,12 +2600,14 @@ "mode": "off" } }, + "decimals": 1, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2602,10 +2640,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2625,7 +2664,6 @@ "thresholds": [], "title": "Cummulative Time spent in IO (Write)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2642,6 +2680,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2669,12 +2709,14 @@ "mode": "off" } }, + "decimals": 1, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2707,10 +2749,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 
0, "targets": [ { @@ -2730,7 +2773,6 @@ "thresholds": [], "title": "Cummulative Time spent in IO (Read)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2775,6 +2817,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2807,7 +2851,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2835,10 +2880,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2870,7 +2916,6 @@ ], "title": "Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2924,10 +2969,11 @@ "values": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2959,7 +3005,6 @@ ], "title": "Total Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -2974,6 +3019,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3006,7 +3053,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3034,10 +3082,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3069,7 +3118,6 @@ ], "title": "Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3123,10 +3171,11 @@ "values": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", 
"targets": [ { "datasource": { @@ -3158,7 +3207,6 @@ ], "title": "Total Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3175,6 +3223,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3208,7 +3258,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3288,10 +3339,11 @@ "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3324,6 +3376,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3357,7 +3411,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3430,17 +3485,18 @@ "calcs": [ "mean", "max", - "last" + "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3473,6 +3529,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3508,7 +3566,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3536,10 +3595,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3580,6 +3640,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3615,7 +3677,8 @@ "mode": "absolute", "steps": [ { 
- "color": "green" + "color": "green", + "value": null } ] }, @@ -3643,10 +3706,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3717,6 +3781,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -3749,7 +3815,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3782,10 +3849,11 @@ "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3845,6 +3913,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "ops/s", "axisPlacement": "auto", "barAlignment": 0, @@ -3877,7 +3947,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -3957,10 +4028,11 @@ "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3993,6 +4065,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "ops/s", "axisPlacement": "auto", "barAlignment": 0, @@ -4025,7 +4099,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -4105,10 +4180,11 @@ "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { diff --git a/charts/system/dashboards/summary.json b/charts/system/dashboards/summary.json index 3b043617..09679fd3 100644 --- a/charts/system/dashboards/summary.json +++ b/charts/system/dashboards/summary.json @@ 
-8,13 +8,11 @@ "uid": "grafana" }, "enable": true, - "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "target": { "limit": 100, "matchAny": false, - "tags": [], "type": "dashboard" }, "type": "dashboard" @@ -34,9 +32,6 @@ "showLine": false, "step": "", "tagKeys": "", - "tags": [ - "failure" - ], "tagsField": "", "target": { "limit": 100, @@ -99,9 +94,6 @@ "showLine": false, "step": "", "tagKeys": "", - "tags": [ - "run" - ], "tagsField": "", "target": { "limit": 100, @@ -131,16 +123,10 @@ "showLine": false, "step": "", "tagKeys": "", - "tags": [ - "exit" - ], "tagsField": "", "target": { "limit": 100, "matchAny": false, - "tags": [ - "delete" - ], "type": "tags" }, "textField": "", @@ -153,7 +139,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 12, + "id": 3, "links": [], "liveNow": false, "panels": [ @@ -245,7 +231,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -320,7 +306,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -396,7 +382,7 @@ "showThresholdMarkers": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -469,7 +455,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -565,40 +551,39 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", + "span": 0, + "sparkline": {}, "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "count(max(up{}) by (instance))", "format": "table", - "interval": "", "intervalFactor": 1, - "legendFormat": "", "refId": "active", "step": 10 } ], + "thresholds": "", "title": "Active Services", - "transformations": [], - "type": "stat" + "type": "stat", + 
"valueFontSize": "", + "valueName": "" }, { + "colorBackground": false, + "colorValue": false, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -639,7 +624,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -725,10 +710,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -737,11 +721,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "sum(max(container_memory_working_set_bytes{id=\"/\"}) by (instance))", - "hide": false, - "interval": "", - "legendFormat": "", "refId": "allocations" } ], @@ -823,10 +803,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -835,12 +814,8 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "max(machine_memory_bytes) by (node)", "format": "time_series", - "hide": false, - "interval": "", - "legendFormat": "", "refId": "cores" } ], @@ -873,9 +848,6 @@ "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -926,10 +898,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -938,11 +909,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\"}[$__rate_interval]))", - "hide": false, - "interval": "", - "legendFormat": "", "refId": "vcores" } ], @@ -974,9 +941,6 @@ "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": 
"thresholds" - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -1026,10 +990,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1038,18 +1001,13 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "sum(max(machine_cpu_cores) by (node)) != 0", "format": "time_series", - "hide": false, - "interval": "", - "legendFormat": "", "refId": "cores" } ], "thresholds": "", "title": "CPU Available", - "transformations": [], "type": "stat", "valueFontSize": "", "valueName": "" @@ -1116,10 +1074,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1128,11 +1085,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "sum(max(container_fs_usage_bytes{device=~\"^(^$)|(overlay_.*)|(/dev/[sv]d[a-z][1-9]*)$\"}) by (instance))", - "hide": false, - "interval": "", - "legendFormat": "", "refId": "total" } ], @@ -1204,10 +1157,9 @@ "fields": "", "values": false }, - "text": {}, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "sparkline": {}, "targets": [ @@ -1216,10 +1168,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "sum(max by (device) (container_fs_limit_bytes{id=\"/\",device=~\"^(^$)|(overlay_.*)|(/dev/[sv]d[a-z][1-9]*)$\"}))", - "interval": "", - "legendFormat": "", "refId": "limit" } ], @@ -1269,7 +1218,7 @@ "tags": [], "viewMode": "list" }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1390,7 +1339,7 @@ }, "showHeader": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "scroll": false, "span": 0, "targets": [ @@ -1399,13 +1348,9 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "max(up) by (node, 
instance)", "format": "table", - "instant": false, - "interval": "", "intervalFactor": 1, - "legendFormat": "", "refId": "Used" } ], @@ -1473,7 +1418,7 @@ "showTime": true, "showUser": true }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -1671,21 +1616,15 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "editorMode": "code", - "exemplar": true, "expr": "avg(rate(container_cpu_usage_seconds_total{id=\"/\"}[$__rate_interval])) != 0 ", - "hide": false, - "instant": false, - "interval": "", "legendFormat": "cpu", - "range": true, "refId": "cpu" }, { @@ -1693,14 +1632,8 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "editorMode": "code", - "exemplar": true, "expr": "avg(rate(container_fs_reads_bytes_total{id=\"/\"}[$__rate_interval]) + \nrate(container_fs_writes_bytes_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "instant": false, - "interval": "", "legendFormat": "io", - "range": true, "refId": "io" }, { @@ -1708,14 +1641,8 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "editorMode": "code", - "exemplar": true, "expr": "avg(rate(container_network_receive_bytes_total{id=\"/\"}[$__rate_interval]) + \nrate(container_network_transmit_bytes_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "instant": false, - "interval": "", "legendFormat": "net", - "range": true, "refId": "net" } ], @@ -1801,7 +1728,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -1868,7 +1796,6 @@ }, "id": 109, "isNew": false, - "links": [], "options": { "legend": { "calcs": [ @@ -1885,7 +1812,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -1893,10 +1820,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": 
"max(rate(container_cpu_usage_seconds_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "max", "refId": "max" }, @@ -1905,10 +1829,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_cpu_usage_seconds_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "avg", "refId": "avg" }, @@ -1917,10 +1838,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "min(rate(container_cpu_usage_seconds_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "min", "refId": "min" } @@ -2043,7 +1961,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2051,10 +1969,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_cpu_system_seconds_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "kernel", "refId": "kernel" }, @@ -2063,22 +1978,11 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_cpu_user_seconds_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "user", "refId": "user" } ], - "thresholds": [ - { - "colorMode": "critical", - "op": "lt", - "value": 0.20000000298023224, - "visible": true - } - ], "title": "CPU Utilization [ User / Kernel ]", "type": "timeseries" }, @@ -2221,7 +2125,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2390,7 +2294,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2398,10 +2302,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_usage_bytes{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "Usage", "refId": 
"Usage" }, @@ -2410,10 +2311,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_working_set_bytes{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "WorkingSet", "refId": "WorkingSet" }, @@ -2422,10 +2320,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_rss{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "RSS", "refId": "RSS" }, @@ -2434,10 +2329,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_swap{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "Swap", "refId": "Swap" }, @@ -2446,10 +2338,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_mapped_file{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "MMAP", "refId": "MMAP" }, @@ -2458,16 +2347,12 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_cache{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "Cache", "refId": "Cache" } ], "title": "Average Memory Usage Per Type", - "transformations": [], "type": "timeseries" }, { @@ -2605,7 +2490,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -2613,10 +2498,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_memory_failures_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "PageFaults", "refId": "PageFaults" }, @@ -2625,22 +2507,11 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(container_memory_working_set_bytes{id=\"/\"}) != 0", - "hide": false, - "interval": "", "legendFormat": "RAM", "refId": "RAM" } ], - "thresholds": [ - { - "colorMode": "critical", - "op": "lt", - "value": 0.20000000298023224, 
- "visible": true - } - ], "title": "Memory Allocation (Page Faults / RAM )", "type": "timeseries" }, @@ -2808,7 +2679,7 @@ "showUnfilled": true, "text": {} }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2899,7 +2770,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -2931,7 +2802,6 @@ ], "title": "IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -2989,7 +2859,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3021,7 +2891,6 @@ ], "title": "Total IO Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3085,7 +2954,6 @@ "y": 86 }, "id": 199, - "links": [], "options": { "legend": { "calcs": [], @@ -3098,7 +2966,6 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -3130,7 +2997,6 @@ ], "title": "IO Traffic (Operations)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3188,7 +3054,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3220,7 +3086,6 @@ ], "title": "Total IO Traffic (Operations)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -3304,7 +3169,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3337,7 +3202,6 @@ "thresholds": [], "title": "IO Throughput (Filesystem)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3423,7 +3287,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3456,7 +3320,6 @@ "thresholds": [], "title": "IOPS (Filesystem)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3541,7 +3404,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": 
"9.2.1", "span": 0, "targets": [ { @@ -3577,7 +3440,6 @@ "thresholds": [], "title": "Estimate IO Request Size", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3661,7 +3523,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -3694,7 +3556,6 @@ "thresholds": [], "title": "Average Time spent in IO", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3801,7 +3662,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3833,7 +3694,6 @@ ], "title": "Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -3891,7 +3751,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -3923,7 +3783,6 @@ ], "title": "Total Network Traffic (Bytes)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -4000,7 +3859,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -4032,7 +3891,6 @@ ], "title": "Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "timeseries" }, { @@ -4090,7 +3948,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "targets": [ { "datasource": { @@ -4122,7 +3980,6 @@ ], "title": "Total Network Traffic (Packets)", "transformations": [], - "transparent": false, "type": "piechart" }, { @@ -4258,7 +4115,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4434,7 +4291,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4620,7 +4477,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4798,7 +4655,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", 
"span": 0, "targets": [ { @@ -4806,10 +4663,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_network_receive_bytes_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "Throughput", "refId": "Throughput" }, @@ -4818,22 +4672,11 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_network_receive_packets_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "Packets", "refId": "Packets" } ], - "thresholds": [ - { - "colorMode": "critical", - "op": "lt", - "value": 0.20000000298023224, - "visible": true - } - ], "title": "Correlated Inbound Network Throughput", "type": "timeseries" }, @@ -4935,7 +4778,6 @@ }, "id": 126, "isNew": false, - "links": [], "options": { "legend": { "calcs": [ @@ -4952,7 +4794,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -4960,10 +4802,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "max(rate(container_oom_events_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "intervalFactor": 1, "legendFormat": "max", "refId": "max" @@ -4973,10 +4812,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_oom_events_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "intervalFactor": 1, "legendFormat": "avg", "refId": "avg" @@ -4986,10 +4822,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "min(rate(container_oom_events_total{id=\"/\"}[$__rate_interval])) != 0", - "hide": false, - "interval": "", "intervalFactor": 1, "legendFormat": "min", "refId": "min" @@ -5113,7 +4946,6 @@ }, "id": 168, "isNew": false, - "links": [], "options": { "legend": { "calcs": [ @@ -5130,7 +4962,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + 
"pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -5138,10 +4970,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "max(rate(container_fs_sector_writes_total{id=\"/\", device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "max", "refId": "max" }, @@ -5150,10 +4979,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_fs_sector_writes_total{id=\"/\", device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "avg", "refId": "avg" }, @@ -5162,10 +4988,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "min(rate(container_fs_sector_writes_total{id=\"/\", device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "min", "refId": "min" } @@ -5288,7 +5111,6 @@ }, "id": 169, "isNew": false, - "links": [], "options": { "legend": { "calcs": [ @@ -5305,7 +5127,7 @@ "sort": "none" } }, - "pluginVersion": "9.4.7", + "pluginVersion": "9.2.1", "span": 0, "targets": [ { @@ -5313,10 +5135,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "max(rate(container_fs_sector_reads_total{id=\"/\", device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "max", "refId": "max" }, @@ -5325,10 +5144,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "avg(rate(container_fs_sector_reads_total{id=\"/\", device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "avg", "refId": "avg" }, @@ -5337,10 +5153,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "exemplar": true, "expr": "min(rate(container_fs_sector_reads_total{id=\"/\", 
device=~\"^(^$)|(overlay_.*)|(/dev/mapper/.*)$\", }[$__rate_interval])) != 0", - "hide": false, - "interval": "", "legendFormat": "min", "refId": "min" } diff --git a/charts/system/templates/telemetry/cadvisor/cadvisor.yml b/charts/system/templates/telemetry/cadvisor/cadvisor.yml index b1df8e97..24dfeba2 100644 --- a/charts/system/templates/telemetry/cadvisor/cadvisor.yml +++ b/charts/system/templates/telemetry/cadvisor/cadvisor.yml @@ -52,6 +52,11 @@ spec: nsenter -t ${mainPID} -C -- mount -t cgroup2 none /sys/fs/cgroup } + # update the limits temporarily to avoid "inotify_init: too many open files" + sysctl fs.inotify.max_user_instances=8192 + sysctl fs.inotify.max_user_watches=524288 + sysctl -p + get_main_pid && enter_ns # Start the monitoring diff --git a/charts/system/templates/telemetry/grafana/grafana.yml b/charts/system/templates/telemetry/grafana/grafana.yml index 949b71f7..19196586 100644 --- a/charts/system/templates/telemetry/grafana/grafana.yml +++ b/charts/system/templates/telemetry/grafana/grafana.yml @@ -17,13 +17,14 @@ spec: containers: - name: main image: grafana/grafana-oss:9.4.7 + #image: grafana/grafana-oss:9.2.1 ports: - name: http containerPort: {{.Values.telemetry.grafana.port}} resources: requests: - memory: "4Gi" - cpu: "1" + memory: "14Gi" + cpu: "2" volumeMounts: - name: config mountPath: /etc/grafana/grafana.ini @@ -82,6 +83,10 @@ data: grafana.ini: | # For more options see https://github.com/grafana/grafana/blob/main/conf/defaults.ini + #################################### Logging ########################## + [log] + # Either "debug", "info", "warn", "error", "critical", default is "info" + level = debug #################################### Users ############################### [users] diff --git a/cmd/kubectl-frisbee/commands/common/kubectl-wrapper.go b/cmd/kubectl-frisbee/commands/common/kubectl-wrapper.go index 090778fe..44a584b2 100644 --- a/cmd/kubectl-frisbee/commands/common/kubectl-wrapper.go +++ 
b/cmd/kubectl-frisbee/commands/common/kubectl-wrapper.go @@ -48,9 +48,10 @@ const ( NotResourcesFoundReg = `No resources found*` NotFound = `Error from server (NotFound)` - NoResources = `No resources found in .+ namespace.` - PodNotFound = `Error from server \(NotFound\): pods ".+" not found` - NamespaceNotFound = `Error from server \(NotFound\): namespaces ".+" not found` + NoResources = `No resources found in .+ namespace.` + NoMatchingResources = `no matching resources found` + PodNotFound = `Error from server \(NotFound\): pods ".+" not found` + NamespaceNotFound = `Error from server \(NotFound\): namespaces ".+" not found` ) func ErrContainerNotReady(out []byte) bool { @@ -89,6 +90,15 @@ func ErrNoResources(out []byte) bool { return match } +func ErrNoMatchingResources(out []byte) bool { + match, err := regexp.Match(NoMatchingResources, out) + if err != nil { + panic("unhandled output") + } + + return match +} + func ErrNotFound(out []byte) bool { { // First form if strings.Contains(string(out), NotFound) { @@ -121,28 +131,40 @@ func ErrNotFound(out []byte) bool { return false } -func Kubectl(testName string, arguments ...string) ([]byte, error) { +func Kubectl(testName string, command ...string) ([]byte, error) { + var kubectlArgs []string + if env.Default.KubeConfigPath != "" { - arguments = append(arguments, "--kubeconfig", env.Default.KubeConfigPath) + kubectlArgs = append(kubectlArgs, "--kubeconfig", env.Default.KubeConfigPath) } if testName != "" { - arguments = append(arguments, "--namespace", testName) + kubectlArgs = append(kubectlArgs, "--namespace", testName) } - return process.Execute(env.Default.Kubectl(), arguments...) + kubectlArgs = append(kubectlArgs, command...) + + ui.Debug(env.Default.Kubectl(), strings.Join(kubectlArgs, " ")) + + return process.Execute(env.Default.Kubectl(), kubectlArgs...) 
} -func LoggedKubectl(testName string, arguments ...string) ([]byte, error) { +func LoggedKubectl(testName string, command ...string) ([]byte, error) { + var kubectlArgs []string + if env.Default.KubeConfigPath != "" { - arguments = append(arguments, "--kubeconfig", env.Default.KubeConfigPath) + kubectlArgs = append(kubectlArgs, "--kubeconfig", env.Default.KubeConfigPath) } if testName != "" { - arguments = append(arguments, "--namespace", testName) + kubectlArgs = append(kubectlArgs, "--namespace", testName) } - return process.LoggedExecuteInDir("", os.Stdout, env.Default.Kubectl(), arguments...) + kubectlArgs = append(kubectlArgs, command...) + + ui.Debug(env.Default.Kubectl(), strings.Join(kubectlArgs, " ")) + + return process.LoggedExecuteInDir("", os.Stdout, env.Default.Kubectl(), kubectlArgs...) } func HelmIgnoreNotFound(err error) error { @@ -153,32 +175,40 @@ func HelmIgnoreNotFound(err error) error { return err } -func Helm(testName string, arguments ...string) ([]byte, error) { +func Helm(testName string, command ...string) ([]byte, error) { + var helmArgs []string + if env.Default.KubeConfigPath != "" { - arguments = append(arguments, "--kubeconfig", env.Default.KubeConfigPath) + helmArgs = append(helmArgs, "--kubeconfig", env.Default.KubeConfigPath) } if env.Default.Debug { - arguments = append(arguments, "--debug") + helmArgs = append(helmArgs, "--debug") } if testName != "" { - arguments = append(arguments, "--namespace", testName) + helmArgs = append(helmArgs, "--namespace", testName) } - return process.Execute(env.Default.Helm(), arguments...) + helmArgs = append(helmArgs, command...) + + return process.Execute(env.Default.Helm(), helmArgs...) 
} -func LoggedHelm(testName string, arguments ...string) ([]byte, error) { +func LoggedHelm(testName string, command ...string) ([]byte, error) { + var helmArgs []string + if env.Default.KubeConfigPath != "" { - arguments = append(arguments, "--kubeconfig", env.Default.KubeConfigPath) + helmArgs = append(helmArgs, "--kubeconfig", env.Default.KubeConfigPath) } if testName != "" { - arguments = append(arguments, "--namespace", testName) + helmArgs = append(helmArgs, "--namespace", testName) } - return process.LoggedExecuteInDir("", os.Stdout, env.Default.Helm(), arguments...) + helmArgs = append(helmArgs, command...) + + return process.LoggedExecuteInDir("", os.Stdout, env.Default.Helm(), helmArgs...) } func setOutput(command []string) []string { @@ -284,16 +314,31 @@ func GetTemplateResources(testName string) error { return err } -func WaitForCondition(testName string, condition v1alpha1.ConditionType, timeout string) error { +func WaitForCondition(ctx context.Context, testName string, condition v1alpha1.ConditionType, timeout string) error { command := []string{ "wait", "scenario", "--all=true", "--for=condition=" + condition.String(), "--timeout=" + timeout, } - _, err := LoggedKubectl(testName, command...) + return wait.ExponentialBackoffWithContext(ctx, common.DefaultBackoffForK8sEndpoint, func(ctx context.Context) (done bool, err error) { + out, err := Kubectl(testName, command...) - return err + switch { + case ErrNamespaceNotFound(out): + return true, nil + case len(out) == 0, ErrNoMatchingResources(out): // resource initialization + // ui.Info("Waiting for pods to become ready ...", string(out)) + return false, nil + case err != nil: // abort + return false, err + default: // ok + ui.Info("Condition successful") + + // Output printing is not required as it is printed by the os.Stdout + return true, nil + } + }) } /* @@ -420,43 +465,52 @@ Filter query: - Run with '--logs pod1,pod2,pod3,...'. 
-> monitoring multiple pods */ func KubectlLogs(ctx context.Context, testName string, tail bool, lines int, pods ...string) error { - command := []string{ - "logs", - // "-c", v1alpha1.MainContainerName, - "--all-containers", - "--prefix=true", - } + ui.Debug("Streaming logs is generally not advisable. Setting Max Limit: 100") + + command := []string{"logs", "--max-log-requests=100"} if len(pods) == 0 { - command = append(command, "-l", v1alpha1.LabelScenario) + panic("this should not happen") } + // Case: monitor a specific class of pods. if len(pods) == 1 { switch pods[0] { case "all": - ui.Warn("Streaming all logs in not advised for large experiments. Limit: 100") - + // eq: kubectl logs -l "scenario.frisbee.dev/name" + // We assume that only one scenario is running per namespace. command = append(command, "-l", v1alpha1.LabelScenario) - command = append(command, "--max-log-requests=100") case "SYS": + // eq: kubectl logs -l "scenario.frisbee.dev/name,scenario.frisbee.dev/component=SYS" command = append(command, "-l", strings.Join([]string{v1alpha1.LabelScenario, FilterSYS}, ",")) case "SUT": + // eq: kubectl logs -l "scenario.frisbee.dev/name,scenario.frisbee.dev/component=SUT" command = append(command, "-l", strings.Join([]string{v1alpha1.LabelScenario, FilterSUT}, ",")) default: - command = append(command, pods...) + // eq: kubectl logs + command = append(command, pods[0]) } } + // Case: monitor a pod list if len(pods) > 1 { - command = append(command, pods...) 
+ // eq: kubectl logs -l 'scenario.frisbee.dev/action in (wfa-server,wfb-server)' + command = append(command, "-l", + fmt.Sprintf("%s in (%s)", v1alpha1.LabelAction, strings.Join(pods, ",")), + ) } + // how to present it + command = append(command, + // "-c", v1alpha1.MainContainerName, + "--all-containers", + "--prefix=true", + ) + // with tail if tail { command = append(command, "--ignore-errors=false", "--follow") - ui.Debug(env.Default.Kubectl(), strings.Join(command, " ")) - return wait.ExponentialBackoffWithContext(ctx, common.DefaultBackoffForServiceEndpoint, func(ctx context.Context) (done bool, err error) { out, err := LoggedKubectl(testName, command...) @@ -478,7 +532,6 @@ func KubectlLogs(ctx context.Context, testName string, tail bool, lines int, pod // without tail command = append(command, "--ignore-errors=true", fmt.Sprintf("--tail=%d", lines)) - ui.Debug(env.Default.Kubectl(), strings.Join(command, " ")) out, err := Kubectl(testName, command...) switch { @@ -642,8 +695,6 @@ func ForceDelete(testName string) error { // remove the resource finalizer patch := []string{"patch", crd, resources, "--type", "json", K8SRemoveFinalizer} - ui.Debug("Use patch", env.Default.Kubectl(), strings.Join(patch, " ")) - if _, err := Kubectl(testName, patch...); err != nil { return errors.Wrapf(err, "cannot patch '%s' finalizers", crd) } diff --git a/cmd/kubectl-frisbee/commands/tests/inspect.go b/cmd/kubectl-frisbee/commands/tests/inspect.go index a2af7e79..7f147202 100644 --- a/cmd/kubectl-frisbee/commands/tests/inspect.go +++ b/cmd/kubectl-frisbee/commands/tests/inspect.go @@ -215,7 +215,7 @@ func NewInspectTestCmd() *cobra.Command { err := common.KubectlLogs(cmd.Context(), testName, false, options.Loglines, options.Logs...) 
env.Default.Hint("For more logs use:", "kubectl logs -n", testName, "") - ui.ExitOnError("== Logs FromTime Pods ==", err) + ui.ExitOnError("== Pod Logs ==", err) ui.Success("== Scenario Logs ==") } diff --git a/cmd/kubectl-frisbee/commands/tests/report.go b/cmd/kubectl-frisbee/commands/tests/report.go index 3a38d88c..f41287f8 100644 --- a/cmd/kubectl-frisbee/commands/tests/report.go +++ b/cmd/kubectl-frisbee/commands/tests/report.go @@ -156,7 +156,7 @@ func NewReportTestCmd() *cobra.Command { if options.Wait { ui.Info("Waiting for scenario actions to be completed...") - err = common.WaitForCondition(testName, v1alpha1.ConditionAllJobsAreCompleted, Timeout) + err = common.WaitForCondition(cmd.Context(), testName, v1alpha1.ConditionAllJobsAreCompleted, Timeout) ui.ExitOnError("abnormal termination. err:", err) // get the new status diff --git a/cmd/kubectl-frisbee/commands/tests/submit.go b/cmd/kubectl-frisbee/commands/tests/submit.go index e45ee18f..6af790bc 100644 --- a/cmd/kubectl-frisbee/commands/tests/submit.go +++ b/cmd/kubectl-frisbee/commands/tests/submit.go @@ -125,7 +125,7 @@ func NewSubmitTestCmd() *cobra.Command { // the actual submission. err := common.RunTest(testName, testFile, common.ValidationClient) ui.ExitOnError("Validating testfile: "+testFile, err) - ui.Success("Test File has been successfully validated.", testFile) + ui.Success("Scenario validated:", testFile) /*--------------------------------------------------- * Ensure environment isolation @@ -148,7 +148,7 @@ func NewSubmitTestCmd() *cobra.Command { ui.ExitOnError("Setting namespace quotas", err) } */ - ui.Success("Test Environment is ready.", testName) + ui.Success("Namespace is ready:", testName) /*--------------------------------------------------- * Install Helm Dependencies, if any @@ -164,7 +164,7 @@ func NewSubmitTestCmd() *cobra.Command { ui.ExitOnError("Installing Dependency: "+dependency, err) } - ui.Success("Scenario dependencies are installed.", dependentCharts...) 
+ ui.Success("Installed Dependencies:", dependentCharts...) } /*--------------------------------------------------- @@ -172,7 +172,7 @@ func NewSubmitTestCmd() *cobra.Command { *---------------------------------------------------*/ err = common.RunTest(testName, testFile, common.ValidationNone) ui.ExitOnError("Starting test-case execution ", err) - ui.Success("Test has been successfully submitted.") + ui.Success("Scenario submitted.") // Control test output ControlOutput(cmd.Context(), testName, &options) @@ -189,7 +189,7 @@ func ControlOutput(ctx context.Context, testName string, options *SubmitTestCmdO case options.ExpectSuccess: ui.Info("Expecting the test to complete successfully within ", options.Timeout) - err := common.WaitForCondition(testName, v1alpha1.ConditionAllJobsAreCompleted, options.Timeout) + err := common.WaitForCondition(ctx, testName, v1alpha1.ConditionAllJobsAreCompleted, options.Timeout) env.Default.Hint("To inspect the execution:", "kubectl frisbee inspect test ", testName) ui.ExitOnError("waiting for test to complete successfully", err) @@ -197,7 +197,7 @@ func ControlOutput(ctx context.Context, testName string, options *SubmitTestCmdO case options.ExpectFailure: ui.Info("Expecting the test to fail within ", options.Timeout) - err := common.WaitForCondition(testName, v1alpha1.ConditionJobUnexpectedTermination, options.Timeout) + err := common.WaitForCondition(ctx, testName, v1alpha1.ConditionJobUnexpectedTermination, options.Timeout) env.Default.Hint("To inspect the execution:", "kubectl frisbee inspect test ", testName) ui.ExitOnError("waiting for test to fail", err) @@ -205,7 +205,7 @@ func ControlOutput(ctx context.Context, testName string, options *SubmitTestCmdO case options.ExpectError: ui.Info("Expecting the test to raise an assertion error within ", options.Timeout) - err := common.WaitForCondition(testName, v1alpha1.ConditionAssertionError, options.Timeout) + err := common.WaitForCondition(ctx, testName, 
v1alpha1.ConditionAssertionError, options.Timeout) env.Default.Hint("To inspect the execution:", "kubectl frisbee inspect test ", testName) ui.ExitOnError("waiting for test to raise an assertion error", err) @@ -217,11 +217,12 @@ func ControlOutput(ctx context.Context, testName string, options *SubmitTestCmdO ui.ExitOnError("Watching for changes in the test status error", err) case options.Logs != nil: - ui.Info("Tailing test logs ...") + ui.Warn("Streaming Logs from:", options.Logs...) err := common.KubectlLogs(ctx, testName, true, -1, options.Logs...) env.Default.Hint("To inspect the execution logs use:", "kubectl frisbee inspect test ", testName, " --logs all") + ui.ExitOnError("Getting logs", err) } } diff --git a/cmd/kubectl-frisbee/commands/tests/validate.go b/cmd/kubectl-frisbee/commands/tests/validate.go index f7f9adb4..4f2c6978 100644 --- a/cmd/kubectl-frisbee/commands/tests/validate.go +++ b/cmd/kubectl-frisbee/commands/tests/validate.go @@ -73,7 +73,7 @@ func NewValidateTestCmd() *cobra.Command { ui.ExitOnError("Chart Validation ...", err) - ui.Success("Chart has been successfully validated.", testFile) + ui.Success("Chart validated.", testFile) return } @@ -83,7 +83,7 @@ func NewValidateTestCmd() *cobra.Command { err := validateScenario(testFile) ui.ExitOnError("Validating ...", err) - ui.Success("Scenario has been successfully validated.", testFile) + ui.Success("Scenario validated:", testFile) } }, } diff --git a/cmd/kubectl-frisbee/env/logo.go b/cmd/kubectl-frisbee/env/logo.go index b707440a..24a344a0 100644 --- a/cmd/kubectl-frisbee/env/logo.go +++ b/cmd/kubectl-frisbee/env/logo.go @@ -42,5 +42,5 @@ func Logo() { fmt.Fprint(ui.Writer, ui.Blue(logo())) fmt.Fprintln(ui.Writer) - ui.Success("Connecting to:", Default.KubeConfig.Host) + ui.Success("Kubernetes API:", Default.KubeConfig.Host) } diff --git a/controllers/common/defaults.go b/controllers/common/defaults.go index 88bd4779..44d85e2e 100644 --- a/controllers/common/defaults.go +++ 
b/controllers/common/defaults.go @@ -62,8 +62,8 @@ var DefaultTimeoutFork8sEndpoint = 1 * time.Minute // DefaultBackoffForServiceEndpoint is the default backoff for controller-to-pod communication var DefaultBackoffForServiceEndpoint = wait.Backoff{ - Duration: 5 * time.Second, - Factor: 5, + Duration: 10 * time.Second, + Factor: 0.2, Jitter: 0.1, Steps: 6, } diff --git a/controllers/scenario/controller.go b/controllers/scenario/controller.go index e32dcf92..aae19b64 100644 --- a/controllers/scenario/controller.go +++ b/controllers/scenario/controller.go @@ -151,12 +151,12 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return lifecycle.Pending(ctx, r, &scenario, "Initializing the testing environment") case v1alpha1.PhasePending: - actionList, nextRun, err := r.NextJobs(&scenario) + nextActionList, nextRun, err := r.NextJobs(&scenario) if err != nil { return lifecycle.Failed(ctx, r, &scenario, errors.Wrapf(err, "scheduling error")) } - if len(actionList) == 0 { + if len(nextActionList) == 0 { if nextRun.IsZero() { // nothing to do on this cycle. wait the next cycle trigger by watchers. 
return common.Stop(r, req) @@ -165,7 +165,7 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return common.RequeueAfter(r, req, time.Until(nextRun)) } - if err := r.RunActions(ctx, &scenario, actionList); err != nil { + if err := r.RunActions(ctx, &scenario, nextActionList); err != nil { return lifecycle.Failed(ctx, r, &scenario, errors.Wrapf(err, "actions failed")) } @@ -370,7 +370,7 @@ func (r *Controller) HasFailed(ctx context.Context, scenario *v1alpha1.Scenario) return common.Update(ctx, r, scenario) } -func (r *Controller) RunActions(ctx context.Context, scenario *v1alpha1.Scenario, actionList []v1alpha1.Action) error { +func (r *Controller) RunActions(ctx context.Context, scenario *v1alpha1.Scenario, nextActionList []v1alpha1.Action) error { if scenario.Status.GrafanaEndpoint == "" { r.Logger.Info("Grafana endpoint is empty. Skip telemetry.", "scenario", scenario.GetName()) } else { @@ -379,7 +379,7 @@ func (r *Controller) RunActions(ctx context.Context, scenario *v1alpha1.Scenario } } - for _, action := range actionList { + for _, action := range nextActionList { if action.Assert.HasMetricsExpr() { // Assert belong to the top-level workflow. 
Not to the job if err := expressions.SetAlert(ctx, scenario, action.Assert.Metrics); err != nil { diff --git a/examples/databases/elasticity/install.sh b/examples/databases/elasticity/install.sh index c012e23a..8813adc5 100755 --- a/examples/databases/elasticity/install.sh +++ b/examples/databases/elasticity/install.sh @@ -1,6 +1,8 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail + export NAMESPACE=elasticity export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -10,12 +12,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/databases/cockroachdb ./charts/da # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" + # wait for the scenario to be submitted sleep 10 diff --git a/examples/databases/network-partition/install.sh b/examples/databases/network-partition/install.sh index 50180ed6..e186c9fc 100755 --- a/examples/databases/network-partition/install.sh +++ b/examples/databases/network-partition/install.sh @@ -1,6 +1,7 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail export NAMESPACE=network-partition export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -10,12 +11,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/databases/cockroachdb ./charts/da # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" + # wait for the scenario to be submitted sleep 10 diff --git a/examples/databases/network-partition/manifest.yml b/examples/databases/network-partition/manifest.yml index 701b6926..5ccc4538 100644 --- 
a/examples/databases/network-partition/manifest.yml +++ b/examples/databases/network-partition/manifest.yml @@ -39,7 +39,7 @@ spec: name: wait-for-3x-replication depends: { success: [ import-workload ] } call: - callable: test2-wait-for-3x-replication + callable: wait-for-3x-replication services: [ masters-1 ] # Step 4A. run workload for up-to 10 mins (node1) diff --git a/examples/databases/normal-load/install.sh b/examples/databases/normal-load/install.sh index 675143d5..cbb7cf18 100755 --- a/examples/databases/normal-load/install.sh +++ b/examples/databases/normal-load/install.sh @@ -1,6 +1,8 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail + export NAMESPACE=normal-load export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -11,12 +13,12 @@ export DASHBOARDS=(summary ingleton ycsb) # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" + # wait for the scenario to be submitted sleep 10 diff --git a/examples/databases/sstable-bitrot/install.sh b/examples/databases/sstable-bitrot/install.sh index 9c9188b4..844ca634 100755 --- a/examples/databases/sstable-bitrot/install.sh +++ b/examples/databases/sstable-bitrot/install.sh @@ -1,6 +1,7 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail export NAMESPACE=sstable-bitrot export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -10,12 +11,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/databases/cockroachdb ./charts/da # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow the failing client's logs +kubectl-frisbee submit test "${NAMESPACE}" 
"${SCENARIO}" "${DEPENDENCIES[@]}" --logs masters-1 |& tee -a "${REPORTS}"/logs & + # wait for the scenario to be submitted sleep 10 diff --git a/examples/databases/sstable-bitrot/manifest.yml b/examples/databases/sstable-bitrot/manifest.yml index 3d4b96f1..85839a4d 100644 --- a/examples/databases/sstable-bitrot/manifest.yml +++ b/examples/databases/sstable-bitrot/manifest.yml @@ -39,7 +39,7 @@ spec: depends: { success: [ import-workload ] } call: callable: bitrot - services: [ masters-1, masters-2, masters-3 ] + services: [ masters-1 ] # Step 4. run workload for up-to 10 mins (node1) - action: Service diff --git a/examples/federated_learning/5.crashing-nodes.yml b/examples/federated_learning/5.crashing-nodes.yml deleted file mode 100644 index 74e6a6dd..00000000 --- a/examples/federated_learning/5.crashing-nodes.yml +++ /dev/null @@ -1,53 +0,0 @@ -# This experiment is designed for the evaluation of failed clients. -# For this purpose, we use start with multiple clients and periodically kill three of them. 
---- -apiVersion: frisbee.dev/v1alpha1 -kind: Scenario -metadata: - name: crashing-nodes -spec: - actions: - # Step 1: Create FedBed server - - action: Service - name: server - service: - templateRef: fedbed.server - inputs: - - { backend: "tensorflow", min_fit_clients: 5, min_available_clients: 5 } - - # Step 2: Create FedBed clients - - action: Cluster - name: clients - depends: { running: [ server ] } - cluster: - templateRef: fedbed.client - resources: # Change values here - total: { cpu: 2, mem: 40Gi } - distribution: { name: uniform } - inputs: - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 0 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 1 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 2 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 3 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 4 } - - - # When all clients are up and running, kill some of them periodically - - action: Cascade - name: killer - depends: { running: [ clients ] } - cascade: - templateRef: system.chaos.pod.kill - inputs: - - { target: clients-1 } - - { target: clients-7 } - - { target: clients-13 } - schedule: - cron: "@every 1m" - - # Teardown - - action: Delete - name: teardown - depends: { success: [ clients ] } - delete: - jobs: [ server ] diff --git a/examples/federated_learning/crash-on-epoch/install.sh b/examples/federated_learning/crash-on-epoch/install.sh index 39194f1f..8e9d8889 100755 --- a/examples/federated_learning/crash-on-epoch/install.sh +++ b/examples/federated_learning/crash-on-epoch/install.sh @@ -1,6 +1,7 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail export NAMESPACE=crash-on-epoch export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -10,12 +11,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/federated-learning/fedbed/) # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee 
submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" + # wait for the scenario to be submitted sleep 10 diff --git a/examples/federated_learning/crash-on-epoch/manifest.yml b/examples/federated_learning/crash-on-epoch/manifest.yml index 0fb9e17c..7a064ebe 100644 --- a/examples/federated_learning/crash-on-epoch/manifest.yml +++ b/examples/federated_learning/crash-on-epoch/manifest.yml @@ -12,7 +12,7 @@ spec: service: templateRef: fedbed.server inputs: - - { backend: "tensorflow", min_fit_clients: 4, min_available_clients: 5 } + - { backend: "pytorch", min_fit_clients: 4, min_available_clients: 5 } # Step 2: Create FedBed clients - action: Cluster @@ -24,11 +24,11 @@ spec: total: { cpu: 10, mem: 40Gi } distribution: { name: uniform } inputs: - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 0 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 1 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 2 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 3 } - - { fl_server: server, backend: "tensorflow", total_nodes: 5, node_id: 4 } + - { fl_server: server, backend: "pytorch", total_nodes: 5, node_id: 0 } + - { fl_server: server, backend: "pytorch", total_nodes: 5, node_id: 1 } + - { fl_server: server, backend: "pytorch", total_nodes: 5, node_id: 2 } + - { fl_server: server, backend: "pytorch", total_nodes: 5, node_id: 3 } + - { fl_server: server, backend: "pytorch", total_nodes: 5, node_id: 4 } tolerate: failedJobs: 1 diff --git a/examples/federated_learning/ml-backend/install.sh b/examples/federated_learning/ml-backend/install.sh index 79aa1f67..3140827d 100755 --- a/examples/federated_learning/ml-backend/install.sh +++ b/examples/federated_learning/ml-backend/install.sh @@ -1,5 +1,8 @@ #!/bin/bash +set 
-eu +set -o pipefail + export NAMESPACE=ml-backend export SCENARIO=$(dirname -- "$0")/manifest.yml export REPORTS=${HOME}/frisbee-reports/${NAMESPACE}/ @@ -8,12 +11,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/federated-learning/fedbed/) # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow server logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" --logs server |& tee -a "${REPORTS}"/logs & + # wait for the scenario to be submitted sleep 10 diff --git a/examples/federated_learning/ml-backend/manifest.yml b/examples/federated_learning/ml-backend/manifest.yml index 91f87c7f..d6008358 100644 --- a/examples/federated_learning/ml-backend/manifest.yml +++ b/examples/federated_learning/ml-backend/manifest.yml @@ -4,7 +4,7 @@ apiVersion: frisbee.dev/v1alpha1 kind: Scenario metadata: - name: mnist-python + name: backend spec: actions: # Step 1: Create FedBed server @@ -13,7 +13,7 @@ spec: service: templateRef: fedbed.server inputs: - - { dataset: "MNIST", backend: "python" } # Change values here + - { dataset: "MNIST", backend: "pytorch" } # Change values here # Step 2: Create FedBed clients - action: Cluster @@ -22,7 +22,7 @@ spec: cluster: templateRef: fedbed.client inputs: - - { fl_server: server, dataset: "MNIST", backend: "python" } # Change values here + - { fl_server: server, dataset: "MNIST", backend: "pytorch" } # Change values here # Teardown - action: Delete diff --git a/examples/federated_learning/node-placement/install.sh b/examples/federated_learning/node-placement/install.sh new file mode 100755 index 00000000..e9243f4c --- /dev/null +++ b/examples/federated_learning/node-placement/install.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -eu +set -o pipefail + +export NAMESPACE=node-placement +export SCENARIO=$(dirname --
"$0")/manifest.yml +export REPORTS=${HOME}/frisbee-reports/${NAMESPACE}/ +export DEPENDENCIES=(./charts/system/ ./charts/federated-learning/fedbed/) + +# Prepare the Reporting folder +mkdir -p "${REPORTS}" + +# Submit the scenario and follow server logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" --logs server |& tee -a "${REPORTS}"/logs & + +# Copy the manifest +cp "${SCENARIO}" "${REPORTS}" + +# wait for the scenario to be submitted +sleep 10 + +# Report the scenario +kubectl-frisbee report test "${NAMESPACE}" "${REPORTS}" --pdf --data --aggregated-pdf --wait diff --git a/examples/federated_learning/3.client-placement.yml b/examples/federated_learning/node-placement/manifest.yml similarity index 92% rename from examples/federated_learning/3.client-placement.yml rename to examples/federated_learning/node-placement/manifest.yml index 5e29470d..c16e5e17 100644 --- a/examples/federated_learning/3.client-placement.yml +++ b/examples/federated_learning/node-placement/manifest.yml @@ -3,7 +3,7 @@ apiVersion: frisbee.dev/v1alpha1 kind: Scenario metadata: - name: client-placement + name: node-placement spec: actions: # Step 1: Create FedBed server @@ -14,7 +14,7 @@ spec: inputs: - { min_fit_clients: 20, min_available_clients: 20 } - # Step 2: Place clients[0,4] to Node A + # Step 2: Place clients[0,4] to Node-1 - action: Cluster name: group-a depends: { running: [ server ] } @@ -29,7 +29,7 @@ spec: - { fl_server: server, total_nodes: 20, node_id: 3 } - { fl_server: server, total_nodes: 20, node_id: 4 } - # Step 2: Place clients[5,9] to Node B + # Step 2: Place clients[5,9] to Node-2 - action: Cluster name: group-b depends: { running: [ server ] } @@ -45,7 +45,7 @@ spec: - { fl_server: server, total_nodes: 20, node_id: 9 } - # Step 2: Place clients[10,14] of clients to Node C + # Step 2: Place clients[10,14] to Node-3 - action: Cluster name: group-c depends: { running: [ server ] } @@ -61,7 +61,7 @@ spec: - { fl_server: server,
total_nodes: 20, node_id: 14 } - # Step 2: Place clients[15,19] to Node D + # Step 2: Place clients[15,19] to Node-4 - action: Cluster name: group-d depends: { running: [ server ] } diff --git a/examples/federated_learning/parallel-workflows/install.sh b/examples/federated_learning/parallel-workflows/install.sh index 37f01d79..2ad1de86 100755 --- a/examples/federated_learning/parallel-workflows/install.sh +++ b/examples/federated_learning/parallel-workflows/install.sh @@ -1,6 +1,7 @@ #!/bin/bash -set -eux +set -eu +set -o pipefail export NAMESPACE=parallel-workflows export SCENARIO=$(dirname -- "$0")/manifest.yml @@ -10,12 +11,12 @@ export DEPENDENCIES=(./charts/system/ ./charts/federated-learning/fedbed/) # Prepare the Reporting folder mkdir -p "${REPORTS}" -# Submit the scenario and follow logs -kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" - # Copy the manifest cp "${SCENARIO}" "${REPORTS}" +# Submit the scenario and follow server logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" --logs wfa-server,wfb-server |& tee -a "${REPORTS}"/logs & + # wait for the scenario to be submitted sleep 10 diff --git a/examples/federated_learning/parallel-workflows/manifest.yml b/examples/federated_learning/parallel-workflows/manifest.yml index b2b6f3a2..5bc7b0c6 100644 --- a/examples/federated_learning/parallel-workflows/manifest.yml +++ b/examples/federated_learning/parallel-workflows/manifest.yml @@ -1,5 +1,5 @@ # This experiment is designed for the evaluation of parallel workflows on same clients. -# For this purpose, we run two workflows with controllable interference +# For this purpose, we run two workflows with controllable interference. 
--- apiVersion: frisbee.dev/v1alpha1 kind: Scenario @@ -13,45 +13,76 @@ spec: service: templateRef: fedbed.server inputs: - - { backend: "tensorflow", min_fit_clients: 5, min_available_clients: 5 } + - { min_fit_clients: 20, min_available_clients: 20 } - action: Cluster name: wfa-clients depends: { running: [ wfa-server ] } cluster: templateRef: fedbed.client - resources: - total: { cpu: 2 } + resources: # Change values here + total: { cpu: 40 } distribution: { name: uniform } inputs: - - { fl_server: wfa-server, backend: "tensorflow", total_nodes: 5, node_id: 0 } - - { fl_server: wfa-server, backend: "tensorflow", total_nodes: 5, node_id: 1 } - - { fl_server: wfa-server, backend: "tensorflow", total_nodes: 5, node_id: 2 } - - { fl_server: wfa-server, backend: "tensorflow", total_nodes: 5, node_id: 3 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 0 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 1 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 2 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 3 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 4 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 5 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 6 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 7 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 8 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 9 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 10 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 11 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 12 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 13 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 14 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 15 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 16 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 17 } + - { fl_server: wfa-server, total_nodes: 20, node_id: 18 } + # - { fl_server: wfa-server, total_nodes: 20, 
node_id: 19 } ########### Training Workflow B ################ - action: Service name: wfb-server - depends: { running: [ wfa-server ] } service: templateRef: fedbed.server inputs: - - { backend: "tensorflow", min_fit_clients: 5, min_available_clients: 5 } + - { min_fit_clients: 20, min_available_clients: 20 } - action: Cluster name: wfb-clients depends: { running: [ wfb-server ] } cluster: templateRef: fedbed.client - resources: - total: { cpu: 2 } - distribution: { name: uniform } + resources: # Change values here + total: { cpu: 40 } + distribution: { name: uniform} inputs: - - { fl_server: wfb-server, backend: "tensorflow", total_nodes: 5, node_id: 0 } - - { fl_server: wfb-server, backend: "tensorflow", total_nodes: 5, node_id: 1 } - - { fl_server: wfb-server, backend: "tensorflow", total_nodes: 5, node_id: 2 } - - { fl_server: wfb-server, backend: "tensorflow", total_nodes: 5, node_id: 3 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 0 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 1 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 2 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 3 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 4 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 5 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 6 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 7 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 8 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 9 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 10 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 11 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 12 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 13 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 14 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 15 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 16 } + - { fl_server: wfb-server, total_nodes: 20, node_id: 17 } + - { fl_server: 
wfb-server, total_nodes: 20, node_id: 18 } +# - { fl_server: wfb-server, total_nodes: 20, node_id: 19 } ########### Common Client among Workflows A and B ################ @@ -61,14 +92,14 @@ spec: cluster: templateRef: fedbed.client resources: - total: { cpu: 2 } + total: { cpu: 4 } distribution: { name: constant } inputs: - - { fl_server: "wfa-server wfb-server", backend: "tensorflow", total_nodes: 5, node_id: 4 } + - { fl_server: "wfa-server wfb-server", total_nodes: 20, node_id: 19 } ########### Teardown all workflows ################ - action: Delete name: teardown - depends: { success: [ wfa-clients, wfb-clients, common-client ] } + depends: { success: [ wfa-clients, wfb-clients, common-client ] } # , common-client delete: - jobs: [ wfa-server, wfb-server ] + jobs: [ wfa-server, wfb-server ] \ No newline at end of file diff --git a/examples/federated_learning/resource-distribution/install.sh b/examples/federated_learning/resource-distribution/install.sh new file mode 100755 index 00000000..e0e16bfb --- /dev/null +++ b/examples/federated_learning/resource-distribution/install.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -eu +set -o pipefail + +export NAMESPACE=resource-distribution +export SCENARIO=$(dirname -- "$0")/manifest.yml +export REPORTS=${HOME}/frisbee-reports/${NAMESPACE}/ +export DEPENDENCIES=(./charts/system/ ./charts/federated-learning/fedbed/) + +# Prepare the Reporting folder +mkdir -p "${REPORTS}" + +# Copy the manifest +cp "${SCENARIO}" "${REPORTS}" + +# Submit the scenario and follow server logs +kubectl-frisbee submit test "${NAMESPACE}" "${SCENARIO}" "${DEPENDENCIES[@]}" --logs server |& tee -a "${REPORTS}"/logs & + +# wait for the scenario to be submitted +sleep 10 + +# Report the scenario +kubectl-frisbee report test "${NAMESPACE}" "${REPORTS}" --pdf --data --aggregated-pdf --wait diff --git a/examples/federated_learning/2.resource-distribution.yml b/examples/federated_learning/resource-distribution/manifest.yml similarity index 93% rename from 
examples/federated_learning/2.resource-distribution.yml rename to examples/federated_learning/resource-distribution/manifest.yml index 85395d72..e8cb9275 100644 --- a/examples/federated_learning/2.resource-distribution.yml +++ b/examples/federated_learning/resource-distribution/manifest.yml @@ -21,9 +21,9 @@ spec: depends: { running: [ server ] } cluster: templateRef: fedbed.client - resources: # Change values here - total: { cpu: 40, mem: 40Gi } - distribution: { name: uniform } +# resources: # Change values here +# total: { cpu: 75} +# distribution: { name: normal} inputs: - { fl_server: server, total_nodes: 20, node_id: 0 } - { fl_server: server, total_nodes: 20, node_id: 1 } diff --git a/examples/tutorial/19.sla-assertions.yml b/examples/tutorial/19.sla-assertions.yml index b180e39b..81745714 100644 --- a/examples/tutorial/19.sla-assertions.yml +++ b/examples/tutorial/19.sla-assertions.yml @@ -71,7 +71,7 @@ spec: name: client depends: { running: [ server ] } assert: - metrics: "avg() of query(summary/184/transmit, 1m, now) is above(300000000)" + metrics: "avg() of query(summary/184/transmit, 1m, now) is above(300M)" service: templateRef: iperf.client inputs: diff --git a/go.mod b/go.mod index 8d89325c..aeaa9cf5 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,13 @@ require ( github.com/Knetic/govaluate v3.0.0+incompatible github.com/Masterminds/sprig/v3 v3.2.3 github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 + github.com/dariubs/percent v1.0.0 github.com/dimiro1/banner v1.1.0 github.com/go-logr/logr v1.2.4 github.com/golanghelper/grafana-webhook v0.0.0-20180512191629-e0da26114467 github.com/gosimple/slug v1.13.1 + github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc + github.com/grafana/grafana-api-golang-client v0.21.1 github.com/hashicorp/go-multierror v1.1.1 github.com/imroc/req/v3 v3.35.0 github.com/kubeshop/testkube v1.11.22 @@ -57,8 +60,8 @@ require ( github.com/google/uuid v1.3.0 // indirect github.com/gookit/color v1.5.2 // indirect 
github.com/gosimple/unidecode v1.0.1 // indirect - github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc // indirect github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.13 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect diff --git a/go.sum b/go.sum index 218c009e..f1018290 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,8 @@ github.com/common-nighthawk/go-figure v0.0.0-20200609044655-c4b36f998cf2 h1:tjT4 github.com/common-nighthawk/go-figure v0.0.0-20200609044655-c4b36f998cf2/go.mod h1:mk5IQ+Y0ZeO87b858TlA645sVcEcbiX6YqP98kt+7+w= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/dariubs/percent v1.0.0 h1:fY8q40FRYaCiFZ0gTOa73Cmp21hS32w+tSSmqbGnUzc= +github.com/dariubs/percent v1.0.0/go.mod h1:NDZpkezJ8QqyIW/510MywB5T2KdC8v/0oTlEoPcMsRM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -72,6 +74,8 @@ github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/ github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/gobs/pretty v0.0.0-20180724170744-09732c25a95b h1:/vQ+oYKu+JoyaMPDsv5FzwuL2wwWBgBbtj/YLCi4LuA= +github.com/gobs/pretty v0.0.0-20180724170744-09732c25a95b/go.mod h1:Xo4aNUOrJnVruqWQJBtW6+bTBDTniY8yZum5rF3b5jw= github.com/gobwas/httphead v0.1.0 
h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= @@ -133,10 +137,14 @@ github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6 github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85Tnn+WEvr8fDpfwibmEPgfgFEaC87G24= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= +github.com/grafana/grafana-api-golang-client v0.21.1 h1:39Nqvk5qPBpdrA+uF8sThIGCD9DDbYsJLbOo2WN0g5U= +github.com/grafana/grafana-api-golang-client v0.21.1/go.mod h1:24W29gPe9yl0/3A9X624TPkAOR8DpHno490cPwnkv8E= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= diff --git a/pkg/distributions/constant.go b/pkg/distributions/constant.go index 82a285a1..b4e5a04b 100644 --- a/pkg/distributions/constant.go +++ b/pkg/distributions/constant.go @@ -23,9 +23,7 @@ type Constant struct { // NewConstant creates a new Constant distribution. 
func NewConstant() *Constant { - return &Constant{ - Number{LastValue: 1}, - } + return &Constant{} } // Next computes the value of the probability density function at x. diff --git a/pkg/distributions/sample_generator.go b/pkg/distributions/sample_generator.go index 960ee7c3..1c3f9229 100644 --- a/pkg/distributions/sample_generator.go +++ b/pkg/distributions/sample_generator.go @@ -38,13 +38,21 @@ func GenerateProbabilitySliceFromSpec(samples int64, spec *v1alpha1.Distribution panic("default distribution is a pointer to an already evaluated distribution, and therefore it should be handled before reaching this point") case v1alpha1.DistributionConstant: - return GenerateProbabilitySlice(samples, NewConstant()) + pdfSlice := genProbabilityDensitySlice(samples, NewConstant()) + + return pdfSlice case v1alpha1.DistributionUniform: - return GenerateProbabilitySlice(samples, NewUniform(1, samples)) + pdfSlice := genProbabilityDensitySlice(samples, NewUniform(1, samples)) + + // normalize to the generated values + return pdfSlice.divide(pdfSlice.sum()) case v1alpha1.DistributionNormal: - return GenerateProbabilitySlice(samples, NewNormal(1, samples)) + pdfSlice := genProbabilityDensitySlice(samples, NewNormal(1, samples)) + + // normalize to the generated values + return pdfSlice.divide(pdfSlice.sum()) case v1alpha1.DistributionPareto: if spec.DistParamsPareto == nil { @@ -54,7 +62,10 @@ func GenerateProbabilitySliceFromSpec(samples int64, spec *v1alpha1.Distribution } } - return GenerateProbabilitySlice(samples, NewPareto(spec.DistParamsPareto.Scale, spec.DistParamsPareto.Shape)) + pdfSlice := genProbabilityDensitySlice(samples, NewPareto(spec.DistParamsPareto.Scale, spec.DistParamsPareto.Shape)) + + // normalize to the generated values + return pdfSlice.divide(pdfSlice.sum()) default: // This condition should be captured by upper layers. 
@@ -62,7 +73,7 @@ func GenerateProbabilitySliceFromSpec(samples int64, spec *v1alpha1.Distribution } } -func GenerateProbabilitySlice(samples int64, distgenerator Generator) ProbabilitySlice { +func genProbabilityDensitySlice(samples int64, distgenerator Generator) ProbabilitySlice { // discard the first 0 value distgenerator.Next() @@ -72,8 +83,7 @@ func GenerateProbabilitySlice(samples int64, distgenerator Generator) Probabilit dist[i] = distgenerator.Next() } - // normalize not to exceed total. - return dist.divide(dist.sum()) + return dist } // ProbabilitySlice provides the value of the probability density function at x. diff --git a/pkg/distributions/sample_generator_test.go b/pkg/distributions/sample_generator_test.go index 30b3524d..9b5da7f3 100644 --- a/pkg/distributions/sample_generator_test.go +++ b/pkg/distributions/sample_generator_test.go @@ -22,18 +22,37 @@ func Test_ProbabilityGenerator(t *testing.T) { expected distributions.ProbabilitySlice }{ { - name: "uniform", - dist: distributions.GenerateProbabilitySlice(Samples, distributions.NewUniform(1, Samples)), + name: "constant", + dist: distributions.GenerateProbabilitySliceFromSpec(Samples, + &v1alpha1.DistributionSpec{Name: "constant"}, + ), + expected: distributions.ProbabilitySlice{1, 1, 1, 1, 1}, + }, + { + name: "uniform", + dist: distributions.GenerateProbabilitySliceFromSpec(Samples, + &v1alpha1.DistributionSpec{Name: "uniform"}, + ), expected: distributions.ProbabilitySlice{0.2, 0.2, 0.2, 0.2, 0.2}, }, { - name: "normal", - dist: distributions.GenerateProbabilitySlice(Samples, distributions.NewNormal(1, Samples)), + name: "normal", + dist: distributions.GenerateProbabilitySliceFromSpec(Samples, + &v1alpha1.DistributionSpec{Name: "normal"}, + ), expected: distributions.ProbabilitySlice{0.19, 0.21, 0.21, 0.21, 0.19}, }, { - name: "pareto", - dist: distributions.GenerateProbabilitySlice(Samples, distributions.NewPareto(1, 0.1)), + name: "pareto", + dist: 
distributions.GenerateProbabilitySliceFromSpec(Samples, + &v1alpha1.DistributionSpec{ + Name: "pareto", + DistParamsPareto: &v1alpha1.DistParamsPareto{ + Scale: 1, + Shape: 0.1, + }, + }, + ), expected: distributions.ProbabilitySlice{0.46, 0.22, 0.14, 0.1, 0.08}, }, } @@ -66,9 +85,40 @@ func Test_ResourceDistribution(t *testing.T) { args args want v1alpha1.ResourceDistribution }{ + { + name: "constant", + dist: distributions.GenerateProbabilitySliceFromSpec(Nodes, + &v1alpha1.DistributionSpec{Name: "constant"}, + ), + args: args{total: total}, + want: []corev1.ResourceList{ + { + corev1.ResourceCPU: resource.MustParse("40"), + corev1.ResourceMemory: resource.MustParse("40G"), + }, + { + corev1.ResourceCPU: resource.MustParse("40"), + corev1.ResourceMemory: resource.MustParse("40G"), + }, + { + corev1.ResourceCPU: resource.MustParse("40"), + corev1.ResourceMemory: resource.MustParse("40G"), + }, + { + corev1.ResourceCPU: resource.MustParse("40"), + corev1.ResourceMemory: resource.MustParse("40G"), + }, + { + corev1.ResourceCPU: resource.MustParse("40"), + corev1.ResourceMemory: resource.MustParse("40G"), + }, + }, + }, { name: "uniform", - dist: distributions.GenerateProbabilitySlice(Nodes, distributions.NewUniform(1, Nodes)), + dist: distributions.GenerateProbabilitySliceFromSpec(Nodes, + &v1alpha1.DistributionSpec{Name: "uniform"}, + ), args: args{total: total}, want: []corev1.ResourceList{ { @@ -95,7 +145,9 @@ func Test_ResourceDistribution(t *testing.T) { }, { name: "normal", - dist: distributions.GenerateProbabilitySlice(Nodes, distributions.NewNormal(1, Nodes)), + dist: distributions.GenerateProbabilitySliceFromSpec(Nodes, + &v1alpha1.DistributionSpec{Name: "normal"}, + ), args: args{total: total}, want: []corev1.ResourceList{ { @@ -122,7 +174,15 @@ func Test_ResourceDistribution(t *testing.T) { }, { name: "pareto", - dist: distributions.GenerateProbabilitySlice(Nodes, distributions.NewPareto(1, 0.1)), + dist: 
distributions.GenerateProbabilitySliceFromSpec(Nodes, + &v1alpha1.DistributionSpec{ + Name: "pareto", + DistParamsPareto: &v1alpha1.DistParamsPareto{ + Scale: 1, + Shape: 0.1, + }, + }, + ), args: args{total: total}, want: []corev1.ResourceList{ { @@ -180,9 +240,25 @@ func Test_TimelineDistribution(t *testing.T) { args args want v1alpha1.Timeline }{ + { + name: "constant", + dist: distributions.GenerateProbabilitySliceFromSpec(Timesteps, + &v1alpha1.DistributionSpec{Name: "constant"}, + ), + args: args{total: total}, + want: []metav1.Time{ + {Time: startingTime.Add(300 * time.Second)}, + {Time: startingTime.Add(600 * time.Second)}, + {Time: startingTime.Add(900 * time.Second)}, + {Time: startingTime.Add(1200 * time.Second)}, + {Time: startingTime.Add(1500 * time.Second)}, + }, + }, { name: "uniform", - dist: distributions.GenerateProbabilitySlice(Timesteps, distributions.NewUniform(1, Timesteps)), + dist: distributions.GenerateProbabilitySliceFromSpec(Timesteps, + &v1alpha1.DistributionSpec{Name: "uniform"}, + ), args: args{total: total}, want: []metav1.Time{ {Time: startingTime.Add(60 * time.Second)}, @@ -194,7 +270,9 @@ func Test_TimelineDistribution(t *testing.T) { }, { name: "normal", - dist: distributions.GenerateProbabilitySlice(Timesteps, distributions.NewNormal(1, Timesteps)), + dist: distributions.GenerateProbabilitySliceFromSpec(Timesteps, + &v1alpha1.DistributionSpec{Name: "normal"}, + ), args: args{total: total}, want: []metav1.Time{ {Time: startingTime.Add(57 * time.Second)}, @@ -206,7 +284,15 @@ func Test_TimelineDistribution(t *testing.T) { }, { name: "pareto", - dist: distributions.GenerateProbabilitySlice(Timesteps, distributions.NewPareto(1, 0.1)), + dist: distributions.GenerateProbabilitySliceFromSpec(Timesteps, + &v1alpha1.DistributionSpec{ + Name: "pareto", + DistParamsPareto: &v1alpha1.DistParamsPareto{ + Scale: 1, + Shape: 0.1, + }, + }, + ), args: args{total: total}, want: []metav1.Time{ {Time: startingTime.Add(138 * time.Second)}, diff 
--git a/pkg/expressions/metrics.go b/pkg/expressions/metrics.go index 59f2270a..1fdf0898 100644 --- a/pkg/expressions/metrics.go +++ b/pkg/expressions/metrics.go @@ -108,10 +108,6 @@ func DispatchAlert(ctx context.Context, r common.Reconciler, alertBody *notifier r.Info("New Grafana Alert", "name", alertBody.RuleName, "message", alertBody.Message, "state", alertBody.State) - if true { - panic("POUUUUUUUUUUUUTSES") - } - /*---------------------------------------------------* * Patch Boilerplate *---------------------------------------------------*/ diff --git a/pkg/grafana/alerts.go b/pkg/grafana/alerts.go index f16d06d9..ffb1d32f 100644 --- a/pkg/grafana/alerts.go +++ b/pkg/grafana/alerts.go @@ -26,6 +26,7 @@ import ( "github.com/grafana-tools/sdk" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/wait" ) @@ -177,12 +178,13 @@ func ParseAlertExpr(query v1alpha1.ExprMetrics) (*AlertRule, error) { params := make([]float64, len(paramsStr)) for i, m := range paramsStr { - param, err := strconv.ParseFloat(m, 32) + + quantity, err := resource.ParseQuantity(m) if err != nil { return nil, errors.Wrapf(err, "erroneous parameters") } - params[i] = param + params[i] = quantity.AsApproximateFloat64() } alert.Evaluator.Params = params @@ -201,7 +203,7 @@ func ParseAlertExpr(query v1alpha1.ExprMetrics) (*AlertRule, error) { return &alert, nil } -// SetAlert adds a new alert to Grafana. +// SetAlert adds a new alert to Grafana using the Legacy API. 
func (c *Client) SetAlert(ctx context.Context, alert *AlertRule, name string, msg string) error { if c == nil { panic("empty client was given") @@ -211,6 +213,9 @@ func (c *Client) SetAlert(ctx context.Context, alert *AlertRule, name string, ms return errors.New("NIL alert was given") } + ctxTimeout, cancel := context.WithTimeout(ctx, Timeout) + defer cancel() + /*---------------------------------------------------* * Get the dashboard *---------------------------------------------------*/ @@ -278,19 +283,16 @@ func (c *Client) SetAlert(ctx context.Context, alert *AlertRule, name string, ms } /*---------------------------------------------------* - * Update the Dashboard + * Update the dashboard *---------------------------------------------------*/ - params := sdk.SetDashboardParams{ - Overwrite: false, - PreserveId: true, - } - retryCond := func(ctx context.Context) (done bool, err error) { - resp, errReq := c.Conn.SetDashboard(ctx, board, params) + resp, errReq := c.Conn.SetDashboard(ctx, board, sdk.SetDashboardParams{ + Overwrite: true, // Needed to avoid "someone else had written the dashboard". + PreserveId: true, // Needed to avoid "someone else had written the dashboard". + }) // Retry if errReq != nil { - c.logger.Info("Connection error. 
Retry", "alertName", name, "resp", resp, "err", errReq) return false, nil @@ -306,7 +308,7 @@ func (c *Client) SetAlert(ctx context.Context, alert *AlertRule, name string, ms panic("should not go here") } - if err := wait.ExponentialBackoffWithContext(ctx, common.DefaultBackoffForServiceEndpoint, retryCond); err != nil { + if err := wait.ExponentialBackoffWithContext(ctxTimeout, common.DefaultBackoffForServiceEndpoint, retryCond); err != nil { return errors.Wrapf(err, "cannot set alert '%s'", name) } diff --git a/pkg/grafana/client.go b/pkg/grafana/client.go index 10f0b3c8..ac15f43a 100644 --- a/pkg/grafana/client.go +++ b/pkg/grafana/client.go @@ -24,6 +24,7 @@ import ( "github.com/go-logr/logr" notifier "github.com/golanghelper/grafana-webhook" "github.com/grafana-tools/sdk" + gapi "github.com/grafana/grafana-api-golang-client" "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -82,10 +83,12 @@ type Client struct { Conn *sdk.Client + GapiClient *gapi.Client + BaseURL string } -func New(ctx context.Context, setters ...Option) (*Client, error) { +func New(parentCtx context.Context, setters ...Option) (*Client, error) { var args Options for _, setter := range setters { @@ -131,12 +134,20 @@ func New(ctx context.Context, setters ...Option) (*Client, error) { return true, nil } - if err := wait.ExponentialBackoffWithContext(ctx, common.DefaultBackoffForServiceEndpoint, retryCond); err != nil { + if err := wait.ExponentialBackoffWithContext(parentCtx, common.DefaultBackoffForServiceEndpoint, retryCond); err != nil { return nil, errors.Wrapf(err, "endpoint is unreachable ('%s')", *args.HTTPEndpoint) } client.Conn = conn client.BaseURL = *args.HTTPEndpoint + + // Start Gapi client + gapiClient, err := gapi.New(*args.HTTPEndpoint, gapi.Config{}) + if err != nil { + return nil, errors.Wrapf(err, "Failed to initialize gapi client") + } + + client.GapiClient = gapiClient } 
/*---------------------------------------------------* @@ -147,7 +158,7 @@ func New(ctx context.Context, setters ...Option) (*Client, error) { // Although the notification channel is backed by the Grafana Pod, the Grafana Service is different // from the Alerting Service. For this reason, we must be sure that both Services are linked to the Grafana Pod. - if err := client.SetNotificationChannel(ctx, *args.WebhookURL); err != nil { + if err := client.SetNotificationChannel(parentCtx, *args.WebhookURL); err != nil { return nil, errors.Wrapf(err, "failed to set notification channel") } } diff --git a/pkg/grafana/notifications.go b/pkg/grafana/notifications.go index 78514bf7..4832fcfb 100644 --- a/pkg/grafana/notifications.go +++ b/pkg/grafana/notifications.go @@ -25,7 +25,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" ) -func (c *Client) SetNotificationChannel(ctx context.Context, webhookURL string) error { +func (c *Client) SetNotificationChannel(parentCtx context.Context, webhookURL string) error { // use the webhook as notification channel for grafana feedback := sdk.AlertNotification{ Name: "Frisbee-Webhook", @@ -60,5 +60,8 @@ func (c *Client) SetNotificationChannel(ctx context.Context, webhookURL string) return true, nil } - return wait.ExponentialBackoffWithContext(ctx, common.DefaultBackoffForServiceEndpoint, retryCond) + ctxTimeout, cancel := context.WithTimeout(parentCtx, Timeout) + defer cancel() + + return wait.ExponentialBackoffWithContext(ctxTimeout, common.DefaultBackoffForServiceEndpoint, retryCond) }