Skip to content

Commit

Permalink
observability: prometheus labels (major)
Browse files Browse the repository at this point in the history
* further split StatsD and Prometheus sources:
  - `statsValue` - with no labels for Prometheus
  - `runner.reg` - simplified     --/--
- units and naming: computed latencies are always reported in milliseconds,
  computed throughput - in MB/s
  - unlike the respective "total"s, which are always in nanoseconds and bytes,
    respectively
- units and naming: use "bytes" suffix for all ".size" metrics
  (formerly: "mbytes")
- uptime is now "uptime" (formerly, "up_ms_time")
* Prometheus: add all help descriptions
* part six, prev. commit: c901626

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jul 31, 2024
1 parent b7d0765 commit f5d271b
Show file tree
Hide file tree
Showing 5 changed files with 334 additions and 192 deletions.
33 changes: 23 additions & 10 deletions ais/backend/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
stats.KindCounter,
&stats.Extra{
Help: "GET: total number of executed remote requests (cold GETs)",
StrName: "remote_get_n",
StrName: "remote_get_count",
Labels: labels,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new content in-cluster",
Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster",
StrName: "remote_get_ns_total",
Labels: labels,
},
Expand All @@ -61,15 +61,19 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
b.metrics[stats.GetE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; includes: cold GET, store in-cluster, transmit response",
Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response",
StrName: "remote_e2e_get_ns_total",
Labels: labels,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetSize],
stats.KindSize,
&stats.Extra{Help: "GET: total cumulative size (bytes) of all cold GET operations", StrName: "remote_get_bytes_total", Labels: labels},
&stats.Extra{
Help: "GET: total cumulative size (bytes) of all cold-GET transactions",
StrName: "remote_get_bytes_total",
Labels: labels},
)

// PUT
Expand All @@ -81,13 +85,17 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
tr.RegExtMetric(snode,
b.metrics[stats.PutCount],
stats.KindCounter,
&stats.Extra{Help: "PUT: total number of executed remote requests", StrName: "remote_put_n", Labels: labels},
&stats.Extra{
Help: "PUT: total number of executed remote requests to a given backend",
StrName: "remote_put_count",
Labels: labels,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new content in-cluster",
Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster",
StrName: "remote_put_ns_total",
Labels: labels,
},
Expand All @@ -97,13 +105,18 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
stats.KindTotal,
&stats.Extra{
StrName: "remote_e2e_put_ns_total",
Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; includes: store in-cluster, execute remote PUT",
Labels: labels},
Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object",
Labels: labels},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutSize],
stats.KindSize,
&stats.Extra{Help: "PUT: total cumulative size (bytes) of all PUTs to remote backend", StrName: "remote_e2e_put_bytes_total", Labels: labels},
&stats.Extra{
Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend",
StrName: "remote_e2e_put_bytes_total",
Labels: labels,
},
)

// version changed out-of-band
Expand All @@ -115,7 +128,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
stats.KindCounter,
&stats.Extra{
Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)",
StrName: "remote_ver_change_n",
StrName: "remote_ver_change_count",
Labels: labels,
},
)
Expand Down
107 changes: 21 additions & 86 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ package stats

import (
"encoding/json"
"fmt"
rfs "io/fs"
"os"
"path/filepath"
Expand Down Expand Up @@ -92,6 +91,7 @@ const (
ErrPutMirrorCount = errPrefix + "put.mirror.n"

// KindLatency
// latency stats have numSamples used to compute average latency
GetLatency = "get.ns"
GetLatencyTotal = "get.ns.total"
GetE2ELatencyTotal = "e2e.get.ns.total" // // e2e cold-GET latency
Expand All @@ -117,19 +117,6 @@ type (

// primitives: values and maps
type (
// Stats are tracked via a map of stats names (key) to statsValue (values).
// There are two main types of stats: counter and latency, declared
// using the kind field. Only latency stats have numSamples, used to compute latency.
statsValue struct {
kind string // enum { KindCounter, ..., KindSpecial }
label struct {
comm string // common part of the metric label (as in: <prefix> . comm . <suffix>)
stpr string // StatsD _or_ Prometheus label (depending on build tag)
}
Value int64 `json:"v,string"`
numSamples int64 // (log + StatsD) only
cumulative int64
}
copyValue struct {
Value int64 `json:"v,string"`
}
Expand Down Expand Up @@ -174,76 +161,76 @@ func (r *runner) regCommon(snode *meta.Snode) {
// basic counters
r.reg(snode, GetCount, KindCounter,
&Extra{
Help: "number of executed GET(object) requests",
Help: "total number of executed GET(object) requests",
},
)
r.reg(snode, PutCount, KindCounter,
&Extra{
Help: "number of executed PUT(object) requests",
Help: "total number of executed PUT(object) requests",
},
)
r.reg(snode, AppendCount, KindCounter,
&Extra{
Help: "number of executed APPEND(object) requests",
Help: "total number of executed APPEND(object) requests",
},
)
r.reg(snode, DeleteCount, KindCounter,
&Extra{
Help: "number of executed DELETE(object) requests",
Help: "total number of executed DELETE(object) requests",
},
)
r.reg(snode, RenameCount, KindCounter,
&Extra{
Help: "number of executed Rename(object) requests",
Help: "total number of executed rename(object) requests",
},
)
r.reg(snode, ListCount, KindCounter,
&Extra{
Help: "number of executed list-objects requests",
Help: "total number of executed list-objects requests",
},
)

// basic error counters, respectively
r.reg(snode, ErrGetCount, KindCounter,
&Extra{
Help: "number of GET(object) errors",
Help: "total number of GET(object) errors",
},
)
r.reg(snode, ErrPutCount, KindCounter,
&Extra{
Help: "number of PUT(object) errors",
Help: "total number of PUT(object) errors",
},
)
r.reg(snode, ErrAppendCount, KindCounter,
&Extra{
Help: "number of APPEND(object) errors",
Help: "total number of APPEND(object) errors",
},
)
r.reg(snode, ErrDeleteCount, KindCounter,
&Extra{
Help: "number of DELETE(object) errors",
Help: "total number of DELETE(object) errors",
},
)
r.reg(snode, ErrRenameCount, KindCounter,
&Extra{
Help: "number of Rename(object) errors",
Help: "total number of rename(object) errors",
},
)
r.reg(snode, ErrListCount, KindCounter,
&Extra{
Help: "number of list-objects errors",
Help: "total number of list-objects errors",
},
)

// even more error counters
r.reg(snode, ErrHTTPWriteCount, KindCounter,
&Extra{
Help: "number of HTTP write-response errors",
Help: "total number of HTTP write-response errors",
},
)
r.reg(snode, ErrDownloadCount, KindCounter,
&Extra{
Help: "number of download errors",
Help: "downloader: number of download errors",
},
)
r.reg(snode, ErrPutMirrorCount, KindCounter,
Expand All @@ -255,31 +242,30 @@ func (r *runner) regCommon(snode *meta.Snode) {
// basic latencies
r.reg(snode, GetLatency, KindLatency,
&Extra{
Help: "total cumulative time (nanoseconds) to execute GET requests " +
"(in the logs: average GET latency over the last stats-time interval)",
Help: "GET: average time (milliseconds) over the last periodic.stats_time interval",
},
)
r.reg(snode, GetLatencyTotal, KindTotal,
&Extra{
Help: "total cumulative time (nanoseconds) to execute GET requests",
Help: "GET: total cumulative time (nanoseconds)",
},
)
r.reg(snode, ListLatency, KindLatency,
&Extra{
Help: "total cumulative time (nanoseconds) to execute list-objects requests " +
"(in the logs: average list-objects latency over the last stats-time interval)",
Help: "list-objects: average time (milliseconds) over the last periodic.stats_time interval",
},
)
r.reg(snode, KeepAliveLatency, KindLatency,
&Extra{
Help: "in-cluster keep-alive latency (heartbeat latency, nanoseconds)",
Help: "in-cluster keep-alive (heartbeat): average time (milliseconds) over the last periodic.stats_time interval",
},
)

// special uptime
r.reg(snode, Uptime, KindSpecial,
&Extra{
Help: "this node's uptime since startup (nanoseconds)",
Help: "this node's uptime since its startup (seconds)",
StrName: "uptime",
},
)

Expand All @@ -292,57 +278,6 @@ func (r *runner) regCommon(snode *meta.Snode) {
)
}

// reg builds the statsValue for the metric `name` of the given `kind`,
// stores it in r.core.Tracker under `name`, and hands it to r.regProm.
//
// Two labels are derived from `name`:
//   - the common part: `name` with its kind-specific suffix trimmed and
//     every ":" replaced with "_";
//   - the StatsD/Prometheus label: "ais<node type>.<node ID>.<common>[.<units>]".
//
// Naming convention: ".n" for the count and ".ns" for duration (nanoseconds);
// the remaining suffixes (".total", ".size", ".bps") are asserted per kind below.
// Compare with coreStats.initProm().
func (r *runner) reg(snode *meta.Snode, name, kind string, extra *Extra) {
	// ":" is replaced with "_" in every common label
	noColons := func(s string) string { return strings.ReplaceAll(s, ":", "_") }

	var (
		common string // common part of the label
		units  string // trailing units component; stays empty when the label has none
	)
	switch kind {
	case KindCounter:
		debug.Assert(strings.HasSuffix(name, ".n"), name) // naming convention
		common, units = noColons(strings.TrimSuffix(name, ".n")), "count"
	case KindTotal:
		debug.Assert(strings.HasSuffix(name, ".total"), name) // naming convention
		common, units = noColons(strings.TrimSuffix(name, ".total")), "total"
	case KindSize:
		debug.Assert(strings.HasSuffix(name, ".size"), name) // naming convention
		common, units = noColons(strings.TrimSuffix(name, ".size")), "mbytes"
	case KindLatency:
		debug.Assert(strings.Contains(name, ".ns"), name) // ditto
		// drop the trailing ".ns" and collapse any embedded ".ns." into "."
		trimmed := strings.TrimSuffix(name, ".ns")
		common, units = noColons(strings.ReplaceAll(trimmed, ".ns.", ".")), "ms"
	case KindThroughput, KindComputedThroughput:
		debug.Assert(strings.HasSuffix(name, ".bps"), name) // ditto
		common, units = noColons(strings.TrimSuffix(name, ".bps")), "mbps"
	default:
		debug.Assert(kind == KindGauge || kind == KindSpecial)
		common = noColons(name)
		if name == Uptime {
			// uptime is the only gauge/special metric that carries a units suffix
			common = strings.ReplaceAll(common, ".ns.", ".")
			units = "seconds"
		}
	}

	val := &statsValue{kind: kind}
	val.label.comm = common
	if units == "" {
		val.label.stpr = fmt.Sprintf("%s.%s.%s", "ais"+snode.Type(), snode.ID(), common)
	} else {
		val.label.stpr = fmt.Sprintf("%s.%s.%s.%s", "ais"+snode.Type(), snode.ID(), common, units)
	}
	r.core.Tracker[name] = val

	// no-op for StatsD
	r.regProm(snode, name, extra, val)
}

//
// as cos.StatsUpdater
//
Expand Down
Loading

0 comments on commit f5d271b

Please sign in to comment.