Commit ce78525

Improve panels in the playground grafana (#299)

harjotgill committed Sep 9, 2022
1 parent 76ee48f commit ce78525

Showing 9 changed files with 62 additions and 50 deletions.
2 changes: 1 addition & 1 deletion manifests/charts/istioconfig/values.yaml
@@ -6,5 +6,5 @@ agentservice:
port: 80
otlpPort: 4317

-  authzGrpcTimeout: 0.01s
+  authzGrpcTimeout: 0.5s
maxRequestBytes: 8192
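
The authz gRPC timeout jumps from 10ms to 500ms, presumably to stop spurious deadline errors on the playground's check path. As a minimal sketch (hypothetical helper names, not the actual agent code), a duration string like authzGrpcTimeout is typically parsed and enforced as a context deadline on the outbound call:

    // Sketch: applying a config value like authzGrpcTimeout ("0.5s") as a
    // context deadline on an outbound RPC. Illustrative names, not agent code.
    package main

    import (
        "context"
        "log"
        "time"
    )

    func callWithTimeout(ctx context.Context, raw string, rpc func(context.Context) error) error {
        d, err := time.ParseDuration(raw) // "0.5s" parses to 500ms
        if err != nil {
            return err
        }
        ctx, cancel := context.WithTimeout(ctx, d)
        defer cancel()
        return rpc(ctx) // e.g. the authz Check RPC
    }

    func main() {
        err := callWithTimeout(context.Background(), "0.5s", func(ctx context.Context) error {
            <-ctx.Done() // stand-in for a call that waits out the deadline
            return ctx.Err()
        })
        log.Println(err) // context deadline exceeded, after ~500ms
    }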
@@ -3,6 +3,11 @@ local k = import 'github.com/jsonnet-libs/k8s-libsonnet/1.22/main.libsonnet';

local demoApp = import 'apps/demoapp/main.libsonnet';
local latencyGradientPolicy = import 'github.com/fluxninja/aperture-blueprints/lib/1.0/policies/latency-gradient.libsonnet';
+local aperture = import 'github.com/fluxninja/aperture/libsonnet/1.0/main.libsonnet';

+local Workload = aperture.v1.SchedulerWorkload;
+local LabelMatcher = aperture.v1.LabelMatcher;
+local WorkloadWithLabelMatcher = aperture.v1.SchedulerWorkloadAndLabelMatcher;

local demoappMixin =
demoApp {
@@ -32,6 +37,22 @@ local policy = latencyGradientPolicy({
serviceSelector+: {
service: 'service1-demo-app.demoapp.svc.cluster.local',
},
+    concurrencyLimiter+: {
+      defaultWorkload: {
+        priority: 20,
+        timeout: '0.025s',
+      },
+      workloads: [
+        WorkloadWithLabelMatcher.new(
+          workload=Workload.withPriority(50) + Workload.withTimeout('0.025s'),
+          label_matcher=LabelMatcher.withMatchLabels({ 'request_header_user-type': 'guest' })
+        ),
+        WorkloadWithLabelMatcher.new(
+          workload=Workload.withPriority(200) + Workload.withTimeout('0.025s'),
+          label_matcher=LabelMatcher.withMatchLabels({ 'request_header_user-type': 'subscriber' })
+        ),
+      ],
+    },
}).policy;

{
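The new concurrencyLimiter block gives the scheduler a default workload at priority 20 and two labeled workloads: guests at priority 50 and subscribers at priority 200, each with a 25ms queue timeout. Below is a minimal, illustrative Go sketch of the match_labels semantics this relies on — a flow goes to the first workload whose labels all match, else to the default workload (not the Aperture scheduler's actual implementation):

    // Illustrative classification by match labels; not Aperture code.
    package main

    import "fmt"

    type workload struct {
        name        string
        priority    int
        matchLabels map[string]string
    }

    // matchesAll reports whether every wanted label is present on the flow
    // with the same value (match_labels semantics).
    func matchesAll(flow, want map[string]string) bool {
        for k, v := range want {
            if flow[k] != v {
                return false
            }
        }
        return true
    }

    // classify returns the first matching workload, or the default one.
    func classify(flow map[string]string, workloads []workload, def workload) workload {
        for _, w := range workloads {
            if matchesAll(flow, w.matchLabels) {
                return w
            }
        }
        return def
    }

    func main() {
        workloads := []workload{
            {"guest", 50, map[string]string{"request_header_user-type": "guest"}},
            {"subscriber", 200, map[string]string{"request_header_user-type": "subscriber"}},
        }
        def := workload{"default", 20, nil}
        flow := map[string]string{"request_header_user-type": "subscriber"}
        fmt.Println(classify(flow, workloads, def).name) // subscriber (priority 200)
    }
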
10 changes: 5 additions & 5 deletions manifests/k8s/tanka/jsonnetfile.lock.json
@@ -8,8 +8,8 @@
"subdir": ""
}
},
"version": "f41e919200a8283c21d8d3669caab6505e72972d",
"sum": "EZe68l9CbQtOwVyJQd7XezZcwE8zXZtjNi+s5MN3WCg="
"version": "99977877059d086c9a4253b26778fc567f381e52",
"sum": "eF4lEXF1Wq8/jXEhIEPK+zjATFGjcju+guoad5sflrk="
},
{
"source": {
@@ -18,8 +18,8 @@
"subdir": "lib/1.0"
}
},
"version": "f41e919200a8283c21d8d3669caab6505e72972d",
"sum": "VZw1LdVqxkqL6mbWTHVPWfzcjrkkM8FyDkmv2HoJddc="
"version": "99977877059d086c9a4253b26778fc567f381e52",
"sum": "cQ5J2PMhqaw3NqkMAEml9ocF86HAvwgOS7pxVeBrvJY="
},
{
"source": {
@@ -28,7 +28,7 @@
"subdir": "libsonnet/1.0"
}
},
"version": "7ac610333b11916c02915943c5e9953c5ac111de",
"version": "5408d30c44d75de5fec6a4c7de66023e753eb00c",
"sum": "CEvwNmmpYPp6Yp1F0mqO1tTpTcaKUvnWge5Vh1G64k0="
},
{
4 changes: 2 additions & 2 deletions manifests/k8s/tanka/lib/apps/aperture-grafana/main.libsonnet
@@ -1,7 +1,7 @@
local grafanaOperator = import 'github.com/jsonnet-libs/grafana-operator-libsonnet/4.3/main.libsonnet';
local kubernetesMixin = import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet';

-local decisionDashboard = import 'github.com/fluxninja/aperture-blueprints/lib/1.0/dashboards/decision.libsonnet';
+local policyDashboard = import 'github.com/fluxninja/aperture-blueprints/lib/1.0/dashboards/latency-gradient.libsonnet';

local grafana = grafanaOperator.integreatly.v1alpha1.grafana;
local dashboard = grafanaOperator.integreatly.v1alpha1.grafanaDashboard;
@@ -40,7 +40,7 @@ local dashboards =
[
dashboard.new('example-dashboard') +
dashboard.metadata.withLabels({ 'fluxninja.com/grafana-instance': 'aperture-grafana' }) +
-      dashboard.spec.withJson(std.manifestJsonEx(decisionDashboard({
+      dashboard.spec.withJson(std.manifestJsonEx(policyDashboard({
policyName: 'service1-demo-app',
}).dashboard, indent=' ')) +
dashboard.spec.withDatasources({
8 changes: 4 additions & 4 deletions pkg/flowcontrol/common/metrics.go
@@ -48,25 +48,25 @@ func NewPrometheusMetrics(registry *prometheus.Registry) (*PrometheusMetrics, er
registry: registry,
checkReceivedTotal: prometheus.NewCounter(
prometheus.CounterOpts{
-				Name: metrics.FlowControlCheckRequestsMetricName,
+				Name: metrics.FlowControlRequestsMetricName,
Help: "Total number of aperture check requests handled",
},
),
checkDecision: *prometheus.NewCounterVec(
prometheus.CounterOpts{
-				Name: metrics.FlowControlCheckDecisionsMetricName,
+				Name: metrics.FlowControlDecisionsMetricName,
Help: "Number of aperture check decisions",
}, []string{metrics.FlowControlCheckDecisionTypeLabel},
),
errorReason: *prometheus.NewCounterVec(
prometheus.CounterOpts{
-				Name: metrics.FlowControlCheckErrorReasonMetricName,
+				Name: metrics.FlowControlErrorReasonMetricName,
Help: "Number of error reasons other than unspecified",
}, []string{metrics.FlowControlCheckErrorReasonLabel},
),
rejectReason: *prometheus.NewCounterVec(
prometheus.CounterOpts{
-				Name: metrics.FlowControlCheckRejectReasonMetricName,
+				Name: metrics.FlowControlRejectReasonMetricName,
Help: "Number of reject reasons other than unspecified",
}, []string{metrics.FlowControlCheckRejectReasonLabel},
),
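For context, here is a self-contained sketch of one renamed counter registered and incremented via prometheus/client_golang, as in NewPrometheusMetrics above; the metric and label strings mirror the new constants in pkg/metrics/schema.go, while the rest (including the sample decision-type value) is illustrative:

    // Sketch of registering and using the renamed decision counter.
    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
        registry := prometheus.NewRegistry()
        decisions := prometheus.NewCounterVec(prometheus.CounterOpts{
            Name: "flowcontrol_decisions_count", // metrics.FlowControlDecisionsMetricName
            Help: "Number of aperture check decisions",
        }, []string{"decision_type"}) // metrics.FlowControlCheckDecisionTypeLabel
        registry.MustRegister(decisions)

        decisions.WithLabelValues("DECISION_TYPE_ACCEPTED").Inc() // illustrative value

        families, err := registry.Gather()
        if err != nil {
            panic(err)
        }
        fmt.Println(families[0].GetName()) // flowcontrol_decisions_count
    }
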
34 changes: 13 additions & 21 deletions pkg/metrics/schema.go
@@ -27,30 +27,22 @@ const (
WFQFlowsMetricName = "wfq_flows"
// WFQRequestsMetricName - weighted fair queuing number of requests gauge.
WFQRequestsMetricName = "wfq_requests"
-	// FlowControlCheckRequestsMetricName - counter for Check requests for flowcontrol.
-	FlowControlCheckRequestsMetricName = "flowcontrol_check_requests_total"
-	// FlowControlCheckDecisionsMetricName - counter for Check requests per decision type.
-	FlowControlCheckDecisionsMetricName = "flowcontrol_check_decisions_total"
-	// FlowControlCheckErrorReasonMetricName - metric for error reason on FCS Check requests.
-	FlowControlCheckErrorReasonMetricName = "flowcontrol_check_error_reason_total"
-	// FlowControlCheckRejectReasonMetricName - metric for reject reason on FCS Check requests.
-	FlowControlCheckRejectReasonMetricName = "flowcontrol_check_reject_reason_total"
+	// FlowControlRequestsMetricName - counter for Check requests for flowcontrol.
+	FlowControlRequestsMetricName = "flowcontrol_requests_count"
+	// FlowControlDecisionsMetricName - counter for Check requests per decision type.
+	FlowControlDecisionsMetricName = "flowcontrol_decisions_count"
+	// FlowControlErrorReasonMetricName - metric for error reason on FCS Check requests.
+	FlowControlErrorReasonMetricName = "flowcontrol_error_reason_count"
+	// FlowControlRejectReasonMetricName - metric for reject reason on FCS Check requests.
+	FlowControlRejectReasonMetricName = "flowcontrol_reject_reason_count"
// TokenBucketMetricName - a gauge that tracks the load shed factor.
TokenBucketMetricName = "token_bucket_lsf"
// TokenBucketFillRateMetricName - a gauge that tracks the fill rate of token bucket.
TokenBucketFillRateMetricName = "token_bucket_bucket_fill_rate"
TokenBucketFillRateMetricName = "token_bucket_fill_rate"
// TokenBucketCapacityMetricName - a gauge that tracks the capacity of token bucket.
TokenBucketCapacityMetricName = "token_bucket_bucket_capacity"
TokenBucketCapacityMetricName = "token_bucket_capacity"
// TokenBucketAvailableMetricName - a gauge that tracks the number of tokens available in token bucket.
TokenBucketAvailableMetricName = "token_bucket_available_tokens"
-	// GroupJobRegisteredMetricName - current number of group job registered.
-	GroupJobRegisteredMetricName = "group_job_registered_number"
-	// GroupJobScheduledMetricName - current number of group job scheduled.
-	GroupJobScheduledMetricName = "group_job_scheduled_number"
-	// GroupJobCompletedMetricName - total number of group job completed.
-	GroupJobCompletedMetricName = "group_job_completed_total"
-	// GroupJobLatencyMetricName - the latency of the group jobs.
-	GroupJobLatencyMetricName = "group_job_latency_seconds"

// PROMETHEUS LABELS.

@@ -75,11 +67,11 @@
// ResponseStatusCodeLabel - label from response status code.
ResponseStatusCodeLabel = "response_status_code"
// FlowControlCheckDecisionTypeLabel - label for decision type dropped or accepted.
FlowControlCheckDecisionTypeLabel = "flowcontrol_check_decision_type"
FlowControlCheckDecisionTypeLabel = "decision_type"
// FlowControlCheckErrorReasonLabel - label for error reason on FCS Check request.
FlowControlCheckErrorReasonLabel = "flowcontrol_check_error_reason"
FlowControlCheckErrorReasonLabel = "error_reason"
// FlowControlCheckRejectReasonLabel - label for reject reason on FCS Check request.
FlowControlCheckRejectReasonLabel = "flowcontrol_check_reject_reason"
FlowControlCheckRejectReasonLabel = "reject_reason"

// DEFAULTS.

15 changes: 7 additions & 8 deletions pkg/otelcollector/metricsprocessor/processor.go
@@ -225,19 +225,19 @@ func (p *metricsProcessor) updateMetrics(
statusCodeStr := statusCode.StringVal()

for _, decision := range checkResponse.LimiterDecisions {
workload := ""
if cl := decision.GetConcurrencyLimiter(); cl != nil {
workload = cl.GetWorkloadIndex()
}
labels := map[string]string{
metrics.PolicyNameLabel: decision.PolicyName,
metrics.PolicyHashLabel: decision.PolicyHash,
metrics.ComponentIndexLabel: fmt.Sprintf("%d", decision.ComponentIndex),
metrics.DecisionTypeLabel: checkResponse.DecisionType.String(),
+			metrics.WorkloadIndexLabel: workload,
}
log.Trace().Msgf("labels: %v", labels)

workload := ""
if cl := decision.GetConcurrencyLimiter(); cl != nil {
workload = cl.GetWorkloadIndex()
}
err = p.updateMetricsForWorkload(labels, latency, workload)
err = p.updateMetricsForWorkload(labels, latency)
if err != nil {
return err
}
@@ -250,8 +250,7 @@
return nil
}

-func (p *metricsProcessor) updateMetricsForWorkload(labels map[string]string, latency float64, workload string) error {
-	labels[metrics.WorkloadIndexLabel] = workload
+func (p *metricsProcessor) updateMetricsForWorkload(labels map[string]string, latency float64) error {
latencyHistogram, err := p.workloadLatencyHistogram.GetMetricWith(labels)
if err != nil {
log.Warn().Err(err).Msg("Getting latency histogram")
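Net effect of the processor refactor: the workload index is resolved before the label map is built, so updateMetricsForWorkload receives a complete label set and no longer mutates its argument. A trimmed, self-contained sketch of the new shape (metric name and label values here are illustrative, not the real processor's):

    // Sketch: the caller assembles all labels up front; the helper only
    // looks up the histogram child and observes.
    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    var workloadLatencyHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
        Name: "workload_latency_ms",
        Help: "Latency histogram per workload",
    }, []string{"decision_type", "workload_index"})

    func updateMetricsForWorkload(labels map[string]string, latency float64) error {
        latencyHistogram, err := workloadLatencyHistogram.GetMetricWith(labels)
        if err != nil {
            return err
        }
        latencyHistogram.Observe(latency)
        return nil
    }

    func main() {
        labels := map[string]string{
            "decision_type":  "DECISION_TYPE_ACCEPTED",
            "workload_index": "0", // resolved before the call, not inside it
        }
        fmt.Println(updateMetricsForWorkload(labels, 12.5)) // <nil>
    }
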
@@ -117,11 +117,11 @@ func newLoadShedActuatorFactory(
errMulti = multierr.Append(errMulti, err)
}
if !prometheusRegistry.Unregister(f.tokenBucketFillRateGaugeVec) {
err := fmt.Errorf("failed to unregister token_bucket_bucket_fill_rate metric")
err := fmt.Errorf("failed to unregister token_bucket_fill_rate metric")
errMulti = multierr.Append(errMulti, err)
}
if !prometheusRegistry.Unregister(f.tokenBucketBucketCapacityGaugeVec) {
err := fmt.Errorf("failed to unregister token_bucket_bucket_capacity metric")
err := fmt.Errorf("failed to unregister token_bucket_capacity metric")
errMulti = multierr.Append(errMulti, err)
}
if !prometheusRegistry.Unregister(f.tokenBucketAvailableTokensGaugeVec) {
@@ -218,11 +218,11 @@ func (lsaFactory *loadShedActuatorFactory) newLoadShedActuator(conLimiter *concu
}
deleted = lsaFactory.tokenBucketFillRateGaugeVec.Delete(metricLabels)
if !deleted {
errMulti = multierr.Append(errMulti, errors.New("failed to delete token_bucket_bucket_fill_rate gauge from its metric vector"))
errMulti = multierr.Append(errMulti, errors.New("failed to delete token_bucket_fill_rate gauge from its metric vector"))
}
deleted = lsaFactory.tokenBucketBucketCapacityGaugeVec.Delete(metricLabels)
if !deleted {
errMulti = multierr.Append(errMulti, errors.New("failed to delete token_bucket_bucket_capacity gauge from its metric vector"))
errMulti = multierr.Append(errMulti, errors.New("failed to delete token_bucket_capacity gauge from its metric vector"))
}
deleted = lsaFactory.tokenBucketAvailableTokensGaugeVec.Delete(metricLabels)
if !deleted {
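The cleanup code above leans on the prometheus client's boolean-returning Unregister and Delete, accumulating failures with multierr instead of bailing out early. A compact, runnable sketch of that pattern (registry wiring and label values are illustrative):

    // Sketch: collect cleanup failures rather than returning on the first one.
    package main

    import (
        "errors"
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        "go.uber.org/multierr"
    )

    func main() {
        registry := prometheus.NewRegistry()
        fillRate := prometheus.NewGaugeVec(prometheus.GaugeOpts{
            Name: "token_bucket_fill_rate",
            Help: "Fill rate of the token bucket",
        }, []string{"policy_name"})
        registry.MustRegister(fillRate)
        fillRate.WithLabelValues("service1-demo-app").Set(100)

        var errMulti error
        // Delete reports whether a child with these labels existed.
        if deleted := fillRate.Delete(prometheus.Labels{"policy_name": "service1-demo-app"}); !deleted {
            errMulti = multierr.Append(errMulti, errors.New("failed to delete token_bucket_fill_rate gauge from its metric vector"))
        }
        // Unregister reports whether the collector was registered.
        if !registry.Unregister(fillRate) {
            errMulti = multierr.Append(errMulti, errors.New("failed to unregister token_bucket_fill_rate metric"))
        }
        fmt.Println(errMulti) // <nil> when both succeed
    }
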
10 changes: 5 additions & 5 deletions tools/load_generator/scenarios/load_test.js
@@ -2,12 +2,12 @@ import http from "k6/http";
import { check } from "k6";

export let vuStages = [
{ duration: "30s", target: 5 }, // simulate ramp-up of traffic from 0 to 5 users over 30 seconds
{ duration: "30s", target: 5 }, // stay at 5 users for 30s minutes
{ duration: "2m", target: 15 }, // ramp-up to 10 users over 1 minutes
{ duration: "1s", target: 5 }, // simulate ramp-up of traffic from 0 to 5 users over 30 seconds
{ duration: "2m", target: 5 }, // stay at 5 users for 30s minutes
{ duration: "1m", target: 15 }, // ramp-up to 10 users over 1 minutes
{ duration: "2m", target: 15 }, // stay at 10 users for 2 minutes (peak hour)
{ duration: "10s", target: 5 }, // ramp-down to 5 users in 10 seconds
{ duration: "2m", target: 5 }, // stay at to 5 users in 30 seconds
{ duration: "1s", target: 5 }, // ramp-down to 5 users in 10 seconds
{ duration: "5m", target: 5 }, // stay at to 5 users in 30 seconds
];

export let options = {
